imsciences 0.9.6.9__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +2 -2
- imsciences/geo.py +173 -115
- imsciences/mmm.py +930 -409
- imsciences/pull.py +1952 -1154
- imsciences/unittesting.py +729 -478
- imsciences/vis.py +669 -126
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/METADATA +1 -1
- imsciences-1.0.1.dist-info/RECORD +12 -0
- imsciences-0.9.6.9.dist-info/RECORD +0 -12
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/WHEEL +0 -0
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/top_level.txt +0 -0
imsciences/unittesting.py
CHANGED
|
@@ -1,25 +1,27 @@
|
|
|
1
|
+
import os
|
|
1
2
|
import unittest
|
|
2
|
-
|
|
3
|
+
|
|
3
4
|
import numpy as np
|
|
4
|
-
import
|
|
5
|
+
import pandas as pd
|
|
5
6
|
from mmm import dataprocessing
|
|
6
7
|
|
|
8
|
+
|
|
7
9
|
class TestDataProcessor(unittest.TestCase):
|
|
8
|
-
|
|
9
10
|
def setUp(self):
|
|
10
11
|
self.dp = dataprocessing()
|
|
11
|
-
self.df = pd.DataFrame(
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
12
|
+
self.df = pd.DataFrame(
|
|
13
|
+
{
|
|
14
|
+
"date": pd.date_range(start="2023-01-01", periods=10, freq="D"),
|
|
15
|
+
"value1": range(10),
|
|
16
|
+
"value2": range(10, 20),
|
|
17
|
+
},
|
|
18
|
+
)
|
|
19
|
+
self.mixed_date_df = pd.DataFrame(
|
|
20
|
+
{"mixed_date": ["2023-01-01", "01/02/2023", "2023/03/01", "2023-04-01"]},
|
|
21
|
+
)
|
|
22
|
+
self.merged_df = pd.DataFrame(
|
|
23
|
+
{"col1": ["A", "B", "C"], "col2": ["X", "Y", "Z"]},
|
|
24
|
+
)
|
|
23
25
|
|
|
24
26
|
def test_get_wd_levels(self):
|
|
25
27
|
current_dir = os.getcwd()
|
|
@@ -29,258 +31,345 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
29
31
|
def test_aggregate_daily_to_wc_long(self):
|
|
30
32
|
# Create a test DataFrame
|
|
31
33
|
test_data = {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
"date": [
|
|
35
|
+
"2023-01-01",
|
|
36
|
+
"2023-01-02",
|
|
37
|
+
"2023-01-08",
|
|
38
|
+
"2023-01-09",
|
|
39
|
+
"2023-01-10",
|
|
40
|
+
],
|
|
41
|
+
"group_col": ["A", "A", "B", "B", "B"],
|
|
42
|
+
"value1": [10, 20, 30, 40, np.nan],
|
|
43
|
+
"value2": [100, 200, 300, np.nan, 500],
|
|
36
44
|
}
|
|
37
45
|
df = pd.DataFrame(test_data)
|
|
38
46
|
|
|
39
47
|
# Expected output for different test cases
|
|
40
|
-
expected_sum_output = pd.DataFrame(
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
48
|
+
expected_sum_output = pd.DataFrame(
|
|
49
|
+
{
|
|
50
|
+
"OBS": ["2023-01-01", "2023-01-08"], # Week starting on Sunday
|
|
51
|
+
"group_col": ["A", "B"],
|
|
52
|
+
"value1": [30.0, 70.0],
|
|
53
|
+
"value2": [300.0, 800.0],
|
|
54
|
+
},
|
|
55
|
+
)
|
|
46
56
|
|
|
47
57
|
# Convert OBS column to datetime for expected DataFrame
|
|
48
|
-
expected_sum_output[
|
|
58
|
+
expected_sum_output["OBS"] = pd.to_datetime(expected_sum_output["OBS"])
|
|
49
59
|
|
|
50
60
|
# Test sum aggregation
|
|
51
|
-
result_sum = self.dp.aggregate_daily_to_wc_long(
|
|
61
|
+
result_sum = self.dp.aggregate_daily_to_wc_long(
|
|
62
|
+
df,
|
|
63
|
+
"date",
|
|
64
|
+
["group_col"],
|
|
65
|
+
["value1", "value2"],
|
|
66
|
+
wc="sun",
|
|
67
|
+
aggregation="sum",
|
|
68
|
+
)
|
|
52
69
|
|
|
53
70
|
# Ensure both OBS columns are datetime for comparison
|
|
54
|
-
result_sum[
|
|
71
|
+
result_sum["OBS"] = pd.to_datetime(result_sum["OBS"])
|
|
55
72
|
|
|
56
73
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
57
74
|
pd.testing.assert_frame_equal(result_sum, expected_sum_output)
|
|
58
|
-
|
|
75
|
+
|
|
59
76
|
def test_convert_monthly_to_daily(self):
|
|
60
77
|
# Create a test DataFrame with monthly data
|
|
61
78
|
test_data = {
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
79
|
+
"date": ["2023-01-01", "2023-02-01", "2023-03-01"],
|
|
80
|
+
"value1": [31, 28, 31],
|
|
81
|
+
"value2": [310, 280, 310],
|
|
65
82
|
}
|
|
66
83
|
df = pd.DataFrame(test_data)
|
|
67
84
|
|
|
68
85
|
# Expected output DataFrame when divide=True
|
|
69
86
|
expected_daily_data_divide = {
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
87
|
+
"date": pd.date_range(start="2023-01-01", end="2023-01-31").tolist()
|
|
88
|
+
+ pd.date_range(start="2023-02-01", end="2023-02-28").tolist()
|
|
89
|
+
+ pd.date_range(start="2023-03-01", end="2023-03-31").tolist(),
|
|
90
|
+
"value1": [1.0] * 31 + [1.0] * 28 + [1.0] * 31,
|
|
91
|
+
"value2": [10.0] * 31 + [10.0] * 28 + [10.0] * 31,
|
|
75
92
|
}
|
|
76
93
|
expected_daily_df_divide = pd.DataFrame(expected_daily_data_divide)
|
|
77
94
|
|
|
78
95
|
# Call the function with divide=True
|
|
79
|
-
result_divide = self.dp.convert_monthly_to_daily(df,
|
|
96
|
+
result_divide = self.dp.convert_monthly_to_daily(df, "date", divide=True)
|
|
80
97
|
|
|
81
98
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
82
|
-
pd.testing.assert_frame_equal(
|
|
99
|
+
pd.testing.assert_frame_equal(
|
|
100
|
+
result_divide.reset_index(drop=True),
|
|
101
|
+
expected_daily_df_divide,
|
|
102
|
+
)
|
|
83
103
|
|
|
84
104
|
# Expected output DataFrame when divide=False
|
|
85
105
|
expected_daily_data_no_divide = {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
106
|
+
"date": pd.date_range(start="2023-01-01", end="2023-01-31").tolist()
|
|
107
|
+
+ pd.date_range(start="2023-02-01", end="2023-02-28").tolist()
|
|
108
|
+
+ pd.date_range(start="2023-03-01", end="2023-03-31").tolist(),
|
|
109
|
+
"value1": [31] * 31 + [28] * 28 + [31] * 31,
|
|
110
|
+
"value2": [310] * 31 + [280] * 28 + [310] * 31,
|
|
91
111
|
}
|
|
92
112
|
expected_daily_df_no_divide = pd.DataFrame(expected_daily_data_no_divide)
|
|
93
113
|
|
|
94
114
|
# Call the function with divide=False
|
|
95
|
-
result_no_divide = self.dp.convert_monthly_to_daily(df,
|
|
115
|
+
result_no_divide = self.dp.convert_monthly_to_daily(df, "date", divide=False)
|
|
96
116
|
|
|
97
117
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
98
|
-
pd.testing.assert_frame_equal(
|
|
118
|
+
pd.testing.assert_frame_equal(
|
|
119
|
+
result_no_divide.reset_index(drop=True),
|
|
120
|
+
expected_daily_df_no_divide,
|
|
121
|
+
)
|
|
99
122
|
|
|
100
123
|
def test_week_of_year_mapping(self):
|
|
101
124
|
# Create a test DataFrame with ISO week format
|
|
102
|
-
test_data = {
|
|
103
|
-
'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52']
|
|
104
|
-
}
|
|
125
|
+
test_data = {"week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"]}
|
|
105
126
|
df = pd.DataFrame(test_data)
|
|
106
127
|
|
|
107
128
|
# Expected outputs for different start days
|
|
108
|
-
expected_output_mon = pd.DataFrame(
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
129
|
+
expected_output_mon = pd.DataFrame(
|
|
130
|
+
{
|
|
131
|
+
"week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"],
|
|
132
|
+
"OBS": ["02/01/2023", "30/01/2023", "06/03/2023", "25/12/2023"],
|
|
133
|
+
},
|
|
134
|
+
)
|
|
112
135
|
|
|
113
|
-
expected_output_sun = pd.DataFrame(
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
136
|
+
expected_output_sun = pd.DataFrame(
|
|
137
|
+
{
|
|
138
|
+
"week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"],
|
|
139
|
+
"OBS": ["01/01/2023", "29/01/2023", "05/03/2023", "24/12/2023"],
|
|
140
|
+
},
|
|
141
|
+
)
|
|
117
142
|
|
|
118
143
|
# Test mapping with Monday as start day
|
|
119
|
-
result_mon = self.dp.week_of_year_mapping(df.copy(),
|
|
144
|
+
result_mon = self.dp.week_of_year_mapping(df.copy(), "week_col", "mon")
|
|
120
145
|
pd.testing.assert_frame_equal(result_mon, expected_output_mon)
|
|
121
146
|
|
|
122
147
|
# Test mapping with Sunday as start day
|
|
123
|
-
result_sun = self.dp.week_of_year_mapping(df.copy(),
|
|
148
|
+
result_sun = self.dp.week_of_year_mapping(df.copy(), "week_col", "sun")
|
|
124
149
|
pd.testing.assert_frame_equal(result_sun, expected_output_sun)
|
|
125
150
|
|
|
126
151
|
# Test with invalid start day input
|
|
127
152
|
with self.assertRaises(ValueError) as context:
|
|
128
|
-
self.dp.week_of_year_mapping(df.copy(),
|
|
153
|
+
self.dp.week_of_year_mapping(df.copy(), "week_col", "invalid_day")
|
|
129
154
|
self.assertIn("Invalid day input", str(context.exception))
|
|
130
155
|
|
|
131
156
|
def test_rename_cols(self):
|
|
132
157
|
# Create a test DataFrame
|
|
133
158
|
test_data = {
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
159
|
+
"OBS": [1, 2, 3],
|
|
160
|
+
"Column One": [10, 20, 30],
|
|
161
|
+
"Another Column": [100, 200, 300],
|
|
162
|
+
"Special Characters !@#": [5, 15, 25],
|
|
138
163
|
}
|
|
139
164
|
df = pd.DataFrame(test_data)
|
|
140
165
|
|
|
141
166
|
# Expected output with default prefix
|
|
142
|
-
expected_output_default = pd.DataFrame(
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
167
|
+
expected_output_default = pd.DataFrame(
|
|
168
|
+
{
|
|
169
|
+
"OBS": [1, 2, 3],
|
|
170
|
+
"ame_column_one": [10, 20, 30],
|
|
171
|
+
"ame_another_column": [100, 200, 300],
|
|
172
|
+
"ame_special_characters_!@#": [5, 15, 25],
|
|
173
|
+
},
|
|
174
|
+
)
|
|
148
175
|
|
|
149
176
|
# Expected output with custom prefix
|
|
150
|
-
expected_output_custom = pd.DataFrame(
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
177
|
+
expected_output_custom = pd.DataFrame(
|
|
178
|
+
{
|
|
179
|
+
"OBS": [1, 2, 3],
|
|
180
|
+
"custom_column_one": [10, 20, 30],
|
|
181
|
+
"custom_another_column": [100, 200, 300],
|
|
182
|
+
"custom_special_characters_!@#": [5, 15, 25],
|
|
183
|
+
},
|
|
184
|
+
)
|
|
156
185
|
|
|
157
186
|
# Test renaming columns with default prefix
|
|
158
187
|
result_default = self.dp.rename_cols(df)
|
|
159
188
|
pd.testing.assert_frame_equal(result_default, expected_output_default)
|
|
160
189
|
|
|
161
190
|
# Test renaming columns with custom prefix
|
|
162
|
-
result_custom = self.dp.rename_cols(df, name=
|
|
191
|
+
result_custom = self.dp.rename_cols(df, name="custom_")
|
|
163
192
|
pd.testing.assert_frame_equal(result_custom, expected_output_custom)
|
|
164
193
|
|
|
165
194
|
# Test that 'OBS' column remains unchanged
|
|
166
|
-
self.assertIn(
|
|
167
|
-
self.assertIn(
|
|
195
|
+
self.assertIn("OBS", result_default.columns)
|
|
196
|
+
self.assertIn("OBS", result_custom.columns)
|
|
168
197
|
|
|
169
198
|
def test_merge_new_and_old(self):
|
|
170
199
|
# Create test DataFrames for old and new data
|
|
171
200
|
old_data = {
|
|
172
|
-
|
|
173
|
-
|
|
201
|
+
"OBS": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"],
|
|
202
|
+
"old_values": [10, 20, 30, 40],
|
|
174
203
|
}
|
|
175
204
|
new_data = {
|
|
176
|
-
|
|
177
|
-
|
|
205
|
+
"OBS": ["2023-01-04", "2023-01-05", "2023-01-06"],
|
|
206
|
+
"new_values": [100, 200, 300],
|
|
178
207
|
}
|
|
179
208
|
old_df = pd.DataFrame(old_data)
|
|
180
209
|
new_df = pd.DataFrame(new_data)
|
|
181
210
|
|
|
182
211
|
# Expected output
|
|
183
|
-
expected_output = pd.DataFrame(
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
212
|
+
expected_output = pd.DataFrame(
|
|
213
|
+
{
|
|
214
|
+
"OBS": pd.to_datetime(
|
|
215
|
+
[
|
|
216
|
+
"2023-01-01",
|
|
217
|
+
"2023-01-02",
|
|
218
|
+
"2023-01-03",
|
|
219
|
+
"2023-01-04",
|
|
220
|
+
"2023-01-05",
|
|
221
|
+
"2023-01-06",
|
|
222
|
+
],
|
|
223
|
+
),
|
|
224
|
+
"new_values": [10, 20, 30, 40, 200, 300],
|
|
225
|
+
},
|
|
226
|
+
)
|
|
187
227
|
|
|
188
228
|
# Test merging with cutoff_date='2023-01-04'
|
|
189
|
-
result = self.dp.merge_new_and_old(
|
|
229
|
+
result = self.dp.merge_new_and_old(
|
|
230
|
+
old_df,
|
|
231
|
+
"old_values",
|
|
232
|
+
new_df,
|
|
233
|
+
"new_values",
|
|
234
|
+
"2023-01-04",
|
|
235
|
+
)
|
|
190
236
|
|
|
191
237
|
# Assertions
|
|
192
238
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
193
239
|
|
|
194
240
|
# Test that columns are correctly renamed and sorted
|
|
195
|
-
self.assertIn(
|
|
196
|
-
self.assertIn(
|
|
241
|
+
self.assertIn("OBS", result.columns)
|
|
242
|
+
self.assertIn("new_values", result.columns)
|
|
197
243
|
self.assertEqual(len(result), len(expected_output)) # Ensure row count matches
|
|
198
|
-
self.assertTrue(
|
|
244
|
+
self.assertTrue(
|
|
245
|
+
(result["OBS"].diff().dropna() >= pd.Timedelta(0)).all(),
|
|
246
|
+
) # Check that dates are in order
|
|
199
247
|
|
|
200
248
|
def test_merge_dataframes_on_column(self):
|
|
201
249
|
# Create test DataFrames
|
|
202
|
-
df1 = pd.DataFrame(
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
'OBS': ['2023-01-03', '2023-01-04', '2023-01-05'],
|
|
212
|
-
'value3': [70, 80, 90]
|
|
213
|
-
})
|
|
250
|
+
df1 = pd.DataFrame(
|
|
251
|
+
{"OBS": ["2023-01-01", "2023-01-02", "2023-01-03"], "value1": [10, 20, 30]},
|
|
252
|
+
)
|
|
253
|
+
df2 = pd.DataFrame(
|
|
254
|
+
{"OBS": ["2023-01-02", "2023-01-03", "2023-01-04"], "value2": [40, 50, 60]},
|
|
255
|
+
)
|
|
256
|
+
df3 = pd.DataFrame(
|
|
257
|
+
{"OBS": ["2023-01-03", "2023-01-04", "2023-01-05"], "value3": [70, 80, 90]},
|
|
258
|
+
)
|
|
214
259
|
|
|
215
260
|
# Ensure test DataFrame columns are datetime
|
|
216
|
-
df1[
|
|
217
|
-
df2[
|
|
218
|
-
df3[
|
|
261
|
+
df1["OBS"] = pd.to_datetime(df1["OBS"])
|
|
262
|
+
df2["OBS"] = pd.to_datetime(df2["OBS"])
|
|
263
|
+
df3["OBS"] = pd.to_datetime(df3["OBS"])
|
|
219
264
|
|
|
220
265
|
# Expected output for outer merge (cast to float64 to match the behavior of fillna)
|
|
221
|
-
expected_output_outer = pd.DataFrame(
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
266
|
+
expected_output_outer = pd.DataFrame(
|
|
267
|
+
{
|
|
268
|
+
"OBS": pd.to_datetime(
|
|
269
|
+
[
|
|
270
|
+
"2023-01-01",
|
|
271
|
+
"2023-01-02",
|
|
272
|
+
"2023-01-03",
|
|
273
|
+
"2023-01-04",
|
|
274
|
+
"2023-01-05",
|
|
275
|
+
],
|
|
276
|
+
),
|
|
277
|
+
"value1": [10.0, 20.0, 30.0, 0.0, 0.0],
|
|
278
|
+
"value2": [0.0, 40.0, 50.0, 60.0, 0.0],
|
|
279
|
+
"value3": [0.0, 0.0, 70.0, 80.0, 90.0],
|
|
280
|
+
},
|
|
281
|
+
)
|
|
227
282
|
|
|
228
283
|
# Expected output for inner merge
|
|
229
|
-
expected_output_inner = pd.DataFrame(
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
284
|
+
expected_output_inner = pd.DataFrame(
|
|
285
|
+
{
|
|
286
|
+
"OBS": pd.to_datetime(["2023-01-03"]),
|
|
287
|
+
"value1": [30],
|
|
288
|
+
"value2": [50],
|
|
289
|
+
"value3": [70],
|
|
290
|
+
},
|
|
291
|
+
)
|
|
235
292
|
|
|
236
293
|
# Test outer merge
|
|
237
|
-
result_outer = self.dp.merge_dataframes_on_column(
|
|
238
|
-
|
|
294
|
+
result_outer = self.dp.merge_dataframes_on_column(
|
|
295
|
+
[df1, df2, df3],
|
|
296
|
+
common_column="OBS",
|
|
297
|
+
merge_how="outer",
|
|
298
|
+
)
|
|
299
|
+
pd.testing.assert_frame_equal(
|
|
300
|
+
result_outer.reset_index(drop=True),
|
|
301
|
+
expected_output_outer,
|
|
302
|
+
)
|
|
239
303
|
|
|
240
304
|
# Test inner merge
|
|
241
|
-
result_inner = self.dp.merge_dataframes_on_column(
|
|
242
|
-
|
|
305
|
+
result_inner = self.dp.merge_dataframes_on_column(
|
|
306
|
+
[df1, df2, df3],
|
|
307
|
+
common_column="OBS",
|
|
308
|
+
merge_how="inner",
|
|
309
|
+
)
|
|
310
|
+
pd.testing.assert_frame_equal(
|
|
311
|
+
result_inner.reset_index(drop=True),
|
|
312
|
+
expected_output_inner,
|
|
313
|
+
)
|
|
243
314
|
|
|
244
315
|
# Test with empty DataFrame list
|
|
245
|
-
result_empty = self.dp.merge_dataframes_on_column(
|
|
316
|
+
result_empty = self.dp.merge_dataframes_on_column(
|
|
317
|
+
[],
|
|
318
|
+
common_column="OBS",
|
|
319
|
+
merge_how="outer",
|
|
320
|
+
)
|
|
246
321
|
self.assertIsNone(result_empty)
|
|
247
322
|
|
|
248
323
|
# Test with one DataFrame in the list
|
|
249
|
-
result_single = self.dp.merge_dataframes_on_column(
|
|
324
|
+
result_single = self.dp.merge_dataframes_on_column(
|
|
325
|
+
[df1],
|
|
326
|
+
common_column="OBS",
|
|
327
|
+
merge_how="outer",
|
|
328
|
+
)
|
|
250
329
|
pd.testing.assert_frame_equal(result_single.reset_index(drop=True), df1)
|
|
251
330
|
|
|
252
331
|
# Test that the common column is sorted and converted to datetime
|
|
253
|
-
self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer[
|
|
254
|
-
self.assertTrue(
|
|
332
|
+
self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer["OBS"]))
|
|
333
|
+
self.assertTrue(
|
|
334
|
+
(result_outer["OBS"].diff().dropna() >= pd.Timedelta(0)).all(),
|
|
335
|
+
) # Check sorted dates
|
|
255
336
|
|
|
256
337
|
def test_merge_and_update_dfs(self):
|
|
257
338
|
# Create test DataFrames
|
|
258
|
-
df1 = pd.DataFrame(
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
339
|
+
df1 = pd.DataFrame(
|
|
340
|
+
{
|
|
341
|
+
"OBS": ["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
342
|
+
"value1": [10, 20, 30],
|
|
343
|
+
"value2": [100, 200, 300],
|
|
344
|
+
},
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
df2 = pd.DataFrame(
|
|
348
|
+
{
|
|
349
|
+
"OBS": ["2023-01-02", "2023-01-03", "2023-01-04"],
|
|
350
|
+
"value1": [15, 25, 35], # Updates for value1
|
|
351
|
+
"value3": [400, 500, 600], # New column
|
|
352
|
+
},
|
|
353
|
+
)
|
|
269
354
|
|
|
270
355
|
# Ensure test DataFrame columns are datetime
|
|
271
|
-
df1[
|
|
272
|
-
df2[
|
|
356
|
+
df1["OBS"] = pd.to_datetime(df1["OBS"])
|
|
357
|
+
df2["OBS"] = pd.to_datetime(df2["OBS"])
|
|
273
358
|
|
|
274
359
|
# Expected output with float64 for numeric columns
|
|
275
|
-
expected_output = pd.DataFrame(
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
360
|
+
expected_output = pd.DataFrame(
|
|
361
|
+
{
|
|
362
|
+
"OBS": pd.to_datetime(
|
|
363
|
+
["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"],
|
|
364
|
+
),
|
|
365
|
+
"value1": [10.0, 15.0, 25.0, 35.0], # Updated where applicable
|
|
366
|
+
"value2": [100.0, 200.0, 300.0, 0.0], # From df1, 0 where not available
|
|
367
|
+
"value3": [0.0, 400.0, 500.0, 600.0], # From df2, 0 where not available
|
|
368
|
+
},
|
|
369
|
+
)
|
|
281
370
|
|
|
282
371
|
# Test the merge and update function
|
|
283
|
-
result = self.dp.merge_and_update_dfs(df1, df2, key_column=
|
|
372
|
+
result = self.dp.merge_and_update_dfs(df1, df2, key_column="OBS")
|
|
284
373
|
|
|
285
374
|
# Assertions
|
|
286
375
|
pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output)
|
|
@@ -289,187 +378,256 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
289
378
|
self.assertListEqual(list(result.columns), list(expected_output.columns))
|
|
290
379
|
|
|
291
380
|
# Test that the OBS column is sorted
|
|
292
|
-
self.assertTrue((result[
|
|
381
|
+
self.assertTrue((result["OBS"].diff().dropna() >= pd.Timedelta(0)).all())
|
|
293
382
|
|
|
294
383
|
def test_convert_us_to_uk_dates(self):
|
|
295
384
|
# Create a test DataFrame
|
|
296
385
|
test_data = {
|
|
297
|
-
|
|
386
|
+
"date_col": ["01-02-2023", "03/04/2023", "05-06-2023", "07/08/2023"],
|
|
298
387
|
}
|
|
299
388
|
df = pd.DataFrame(test_data)
|
|
300
389
|
|
|
301
390
|
# Expected output
|
|
302
|
-
expected_output = pd.DataFrame(
|
|
303
|
-
|
|
304
|
-
|
|
391
|
+
expected_output = pd.DataFrame(
|
|
392
|
+
{
|
|
393
|
+
"date_col": pd.to_datetime(
|
|
394
|
+
["2023-01-02", "2023-03-04", "2023-05-06", "2023-07-08"],
|
|
395
|
+
),
|
|
396
|
+
},
|
|
397
|
+
)
|
|
305
398
|
|
|
306
399
|
# Test the conversion function
|
|
307
|
-
result = self.dp.convert_us_to_uk_dates(df.copy(),
|
|
400
|
+
result = self.dp.convert_us_to_uk_dates(df.copy(), "date_col")
|
|
308
401
|
|
|
309
402
|
# Assertions
|
|
310
403
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
311
404
|
|
|
312
405
|
# Test invalid input formats
|
|
313
|
-
invalid_data = pd.DataFrame({
|
|
406
|
+
invalid_data = pd.DataFrame({"date_col": ["invalid-date", "12345"]})
|
|
314
407
|
with self.assertRaises(ValueError):
|
|
315
|
-
self.dp.convert_us_to_uk_dates(invalid_data.copy(),
|
|
408
|
+
self.dp.convert_us_to_uk_dates(invalid_data.copy(), "date_col")
|
|
316
409
|
|
|
317
410
|
# Test missing values
|
|
318
|
-
missing_data = pd.DataFrame({
|
|
319
|
-
result_with_missing = self.dp.convert_us_to_uk_dates(
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
411
|
+
missing_data = pd.DataFrame({"date_col": [None, "03/04/2023"]})
|
|
412
|
+
result_with_missing = self.dp.convert_us_to_uk_dates(
|
|
413
|
+
missing_data.copy(),
|
|
414
|
+
"date_col",
|
|
415
|
+
)
|
|
416
|
+
expected_with_missing = pd.DataFrame(
|
|
417
|
+
{"date_col": [pd.NaT, pd.to_datetime("2023-03-04")]},
|
|
418
|
+
)
|
|
323
419
|
pd.testing.assert_frame_equal(result_with_missing, expected_with_missing)
|
|
324
420
|
|
|
325
421
|
def test_pivot_table(self):
|
|
326
422
|
# Create a test DataFrame
|
|
327
423
|
test_data = {
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
424
|
+
"date": [
|
|
425
|
+
"2023-01-01",
|
|
426
|
+
"2023-01-01",
|
|
427
|
+
"2023-01-02",
|
|
428
|
+
"2023-01-02",
|
|
429
|
+
"2023-01-03",
|
|
430
|
+
],
|
|
431
|
+
"category": ["A", "B", "A", "B", "A"],
|
|
432
|
+
"value": [10.0, 20.0, 30.0, 40.0, 50.0],
|
|
331
433
|
}
|
|
332
434
|
df = pd.DataFrame(test_data)
|
|
333
435
|
|
|
334
436
|
# Ensure the 'date' column is in datetime format
|
|
335
|
-
df[
|
|
437
|
+
df["date"] = pd.to_datetime(df["date"])
|
|
336
438
|
|
|
337
439
|
# Expected output for basic pivot table
|
|
338
|
-
expected_output_basic = pd.DataFrame(
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
440
|
+
expected_output_basic = pd.DataFrame(
|
|
441
|
+
{
|
|
442
|
+
"date": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"]),
|
|
443
|
+
"A": [10.0, 30.0, 50.0], # Cast to float64
|
|
444
|
+
"B": [20.0, 40.0, 0.0], # Cast to float64
|
|
445
|
+
},
|
|
446
|
+
)
|
|
447
|
+
expected_output_basic.columns.name = "category"
|
|
344
448
|
|
|
345
449
|
# Test basic pivot table
|
|
346
|
-
result_basic = self.dp.pivot_table(
|
|
450
|
+
result_basic = self.dp.pivot_table(
|
|
451
|
+
df.copy(),
|
|
452
|
+
index_col="date",
|
|
453
|
+
columns="category",
|
|
454
|
+
values_col="value",
|
|
455
|
+
margins=False,
|
|
456
|
+
fill_value=0,
|
|
457
|
+
)
|
|
347
458
|
|
|
348
459
|
# Convert 'date' columns in both DataFrames to datetime for comparison
|
|
349
|
-
result_basic[
|
|
350
|
-
expected_output_basic[
|
|
460
|
+
result_basic["date"] = pd.to_datetime(result_basic["date"])
|
|
461
|
+
expected_output_basic["date"] = pd.to_datetime(expected_output_basic["date"])
|
|
351
462
|
pd.testing.assert_frame_equal(result_basic, expected_output_basic)
|
|
352
463
|
|
|
353
464
|
# Expected output for pivot table with margins
|
|
354
|
-
expected_output_with_margins = pd.DataFrame(
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
465
|
+
expected_output_with_margins = pd.DataFrame(
|
|
466
|
+
{
|
|
467
|
+
"date": ["2023-01-01", "2023-01-02", "2023-01-03", "Total"],
|
|
468
|
+
"A": [10.0, 30.0, 50.0, 90.0],
|
|
469
|
+
"B": [20.0, 40.0, 0.0, 60.0],
|
|
470
|
+
"Total": [30.0, 70.0, 50.0, 150.0],
|
|
471
|
+
},
|
|
472
|
+
)
|
|
473
|
+
expected_output_with_margins["date"] = pd.to_datetime(
|
|
474
|
+
expected_output_with_margins["date"],
|
|
475
|
+
errors="coerce",
|
|
476
|
+
).fillna("Total")
|
|
477
|
+
expected_output_with_margins.columns.name = "category"
|
|
364
478
|
|
|
365
479
|
# Test pivot table with margins
|
|
366
|
-
result_with_margins = self.dp.pivot_table(
|
|
367
|
-
|
|
480
|
+
result_with_margins = self.dp.pivot_table(
|
|
481
|
+
df.copy(),
|
|
482
|
+
index_col="date",
|
|
483
|
+
columns="category",
|
|
484
|
+
values_col="value",
|
|
485
|
+
margins=True,
|
|
486
|
+
fill_value=0,
|
|
487
|
+
)
|
|
488
|
+
result_with_margins["date"] = pd.to_datetime(
|
|
489
|
+
result_with_margins["date"],
|
|
490
|
+
errors="coerce",
|
|
491
|
+
).fillna("Total")
|
|
368
492
|
pd.testing.assert_frame_equal(result_with_margins, expected_output_with_margins)
|
|
369
493
|
|
|
370
494
|
def test_apply_lookup_table_for_columns(self):
|
|
371
495
|
# Create a test DataFrame
|
|
372
496
|
test_data = {
|
|
373
|
-
|
|
374
|
-
|
|
497
|
+
"col1": ["apple", "banana", "carrot", "date", "eggplant"],
|
|
498
|
+
"col2": ["fruit", "fruit", "vegetable", "fruit", "vegetable"],
|
|
375
499
|
}
|
|
376
500
|
df = pd.DataFrame(test_data)
|
|
377
501
|
|
|
378
502
|
# Lookup dictionary
|
|
379
503
|
lookup_dict = {
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
504
|
+
"apple": "Red Fruit",
|
|
505
|
+
"banana": "Yellow Fruit",
|
|
506
|
+
"carrot": "Orange Vegetable",
|
|
507
|
+
"date": "Brown Fruit",
|
|
384
508
|
}
|
|
385
509
|
|
|
386
510
|
# Expected output with single column lookup
|
|
387
511
|
expected_output_single = df.copy()
|
|
388
|
-
expected_output_single[
|
|
512
|
+
expected_output_single["Mapping"] = [
|
|
513
|
+
"Red Fruit",
|
|
514
|
+
"Yellow Fruit",
|
|
515
|
+
"Orange Vegetable",
|
|
516
|
+
"Brown Fruit",
|
|
517
|
+
"Other",
|
|
518
|
+
]
|
|
389
519
|
|
|
390
520
|
# Test with a single column
|
|
391
|
-
result_single = self.dp.apply_lookup_table_for_columns(
|
|
521
|
+
result_single = self.dp.apply_lookup_table_for_columns(
|
|
522
|
+
df.copy(),
|
|
523
|
+
col_names=["col1"],
|
|
524
|
+
to_find_dict=lookup_dict,
|
|
525
|
+
)
|
|
392
526
|
pd.testing.assert_frame_equal(result_single, expected_output_single)
|
|
393
527
|
|
|
394
528
|
# Expected output with multiple column lookup
|
|
395
529
|
expected_output_multiple = df.copy()
|
|
396
|
-
expected_output_multiple[
|
|
530
|
+
expected_output_multiple["Mapping"] = [
|
|
531
|
+
"Other",
|
|
532
|
+
"Other",
|
|
533
|
+
"Other",
|
|
534
|
+
"Brown Fruit",
|
|
535
|
+
"Other",
|
|
536
|
+
]
|
|
397
537
|
|
|
398
538
|
# Update lookup dictionary to match merged keys
|
|
399
|
-
lookup_dict_merged = {
|
|
400
|
-
'date|fruit': 'Brown Fruit'
|
|
401
|
-
}
|
|
539
|
+
lookup_dict_merged = {"date|fruit": "Brown Fruit"}
|
|
402
540
|
|
|
403
541
|
# Test with multiple columns
|
|
404
|
-
result_multiple = self.dp.apply_lookup_table_for_columns(
|
|
542
|
+
result_multiple = self.dp.apply_lookup_table_for_columns(
|
|
543
|
+
df.copy(),
|
|
544
|
+
col_names=["col1", "col2"],
|
|
545
|
+
to_find_dict=lookup_dict_merged,
|
|
546
|
+
)
|
|
405
547
|
pd.testing.assert_frame_equal(result_multiple, expected_output_multiple)
|
|
406
548
|
|
|
407
549
|
# Test case where no match is found
|
|
408
|
-
df_no_match = pd.DataFrame({
|
|
550
|
+
df_no_match = pd.DataFrame({"col1": ["unknown"]})
|
|
409
551
|
expected_no_match = df_no_match.copy()
|
|
410
|
-
expected_no_match[
|
|
411
|
-
result_no_match = self.dp.apply_lookup_table_for_columns(
|
|
552
|
+
expected_no_match["Mapping"] = ["Other"]
|
|
553
|
+
result_no_match = self.dp.apply_lookup_table_for_columns(
|
|
554
|
+
df_no_match,
|
|
555
|
+
col_names=["col1"],
|
|
556
|
+
to_find_dict=lookup_dict,
|
|
557
|
+
)
|
|
412
558
|
pd.testing.assert_frame_equal(result_no_match, expected_no_match)
|
|
413
559
|
|
|
414
560
|
def test_aggregate_daily_to_wc_wide(self):
|
|
415
561
|
# Create a test DataFrame
|
|
416
562
|
test_data = {
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
563
|
+
"date": [
|
|
564
|
+
"2023-01-01",
|
|
565
|
+
"2023-01-02",
|
|
566
|
+
"2023-01-08",
|
|
567
|
+
"2023-01-09",
|
|
568
|
+
"2023-01-10",
|
|
569
|
+
],
|
|
570
|
+
"group": ["A", "A", "B", "B", "B"],
|
|
571
|
+
"value1": [10, 20, 30, 40, None],
|
|
572
|
+
"value2": [100, 200, 300, None, 500],
|
|
421
573
|
}
|
|
422
574
|
df = pd.DataFrame(test_data)
|
|
423
575
|
|
|
424
576
|
# Expected output for weekly aggregation in wide format
|
|
425
|
-
expected_output = pd.DataFrame(
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
577
|
+
expected_output = pd.DataFrame(
|
|
578
|
+
{
|
|
579
|
+
"OBS": ["2023-01-01", "2023-01-08"], # Weeks starting on Sunday
|
|
580
|
+
"value1_A": [30.0, 0.0],
|
|
581
|
+
"value1_B": [0.0, 70.0],
|
|
582
|
+
"value2_A": [300.0, 0.0],
|
|
583
|
+
"value2_B": [0.0, 800.0],
|
|
584
|
+
"Total value1": [30.0, 70.0],
|
|
585
|
+
"Total value2": [300.0, 800.0],
|
|
586
|
+
},
|
|
587
|
+
)
|
|
434
588
|
|
|
435
589
|
# Test aggregation with totals included
|
|
436
590
|
result = self.dp.aggregate_daily_to_wc_wide(
|
|
437
591
|
df=df.copy(),
|
|
438
|
-
date_column=
|
|
439
|
-
group_columns=[
|
|
440
|
-
sum_columns=[
|
|
441
|
-
wc=
|
|
442
|
-
aggregation=
|
|
443
|
-
include_totals=True
|
|
592
|
+
date_column="date",
|
|
593
|
+
group_columns=["group"],
|
|
594
|
+
sum_columns=["value1", "value2"],
|
|
595
|
+
wc="sun",
|
|
596
|
+
aggregation="sum",
|
|
597
|
+
include_totals=True,
|
|
444
598
|
)
|
|
445
599
|
|
|
446
600
|
# Ensure 'OBS' columns are datetime for comparison
|
|
447
|
-
result[
|
|
448
|
-
expected_output[
|
|
601
|
+
result["OBS"] = pd.to_datetime(result["OBS"])
|
|
602
|
+
expected_output["OBS"] = pd.to_datetime(expected_output["OBS"])
|
|
449
603
|
|
|
450
604
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
451
605
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
452
606
|
|
|
453
607
|
# Test without group columns (no totals, single wide column)
|
|
454
|
-
expected_output_no_group = pd.DataFrame(
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
608
|
+
expected_output_no_group = pd.DataFrame(
|
|
609
|
+
{
|
|
610
|
+
"OBS": ["2023-01-01", "2023-01-08"],
|
|
611
|
+
"value1": [30.0, 70.0],
|
|
612
|
+
"value2": [300.0, 800.0],
|
|
613
|
+
},
|
|
614
|
+
)
|
|
459
615
|
|
|
460
616
|
result_no_group = self.dp.aggregate_daily_to_wc_wide(
|
|
461
617
|
df=df.copy(),
|
|
462
|
-
date_column=
|
|
618
|
+
date_column="date",
|
|
463
619
|
group_columns=[],
|
|
464
|
-
sum_columns=[
|
|
465
|
-
wc=
|
|
466
|
-
aggregation=
|
|
467
|
-
include_totals=False
|
|
620
|
+
sum_columns=["value1", "value2"],
|
|
621
|
+
wc="sun",
|
|
622
|
+
aggregation="sum",
|
|
623
|
+
include_totals=False,
|
|
468
624
|
)
|
|
469
625
|
|
|
470
626
|
# Ensure 'OBS' columns are datetime for comparison
|
|
471
|
-
result_no_group[
|
|
472
|
-
expected_output_no_group[
|
|
627
|
+
result_no_group["OBS"] = pd.to_datetime(result_no_group["OBS"])
|
|
628
|
+
expected_output_no_group["OBS"] = pd.to_datetime(
|
|
629
|
+
expected_output_no_group["OBS"],
|
|
630
|
+
)
|
|
473
631
|
|
|
474
632
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
475
633
|
pd.testing.assert_frame_equal(result_no_group, expected_output_no_group)
|
|
@@ -477,200 +635,233 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
477
635
|
def test_merge_cols_with_seperator(self):
|
|
478
636
|
# Create a test DataFrame
|
|
479
637
|
test_data = {
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
638
|
+
"col1": ["apple", "banana", "cherry"],
|
|
639
|
+
"col2": ["red", "yellow", "red"],
|
|
640
|
+
"col3": ["fruit", "fruit", "fruit"],
|
|
483
641
|
}
|
|
484
642
|
df = pd.DataFrame(test_data)
|
|
485
643
|
|
|
486
644
|
# Test merging two columns with default separator
|
|
487
645
|
expected_output_default = df.copy()
|
|
488
|
-
expected_output_default[
|
|
646
|
+
expected_output_default["Merged"] = ["apple_red", "banana_yellow", "cherry_red"]
|
|
489
647
|
|
|
490
|
-
result_default = self.dp.merge_cols_with_seperator(
|
|
648
|
+
result_default = self.dp.merge_cols_with_seperator(
|
|
649
|
+
df.copy(),
|
|
650
|
+
col_names=["col1", "col2"],
|
|
651
|
+
)
|
|
491
652
|
pd.testing.assert_frame_equal(result_default, expected_output_default)
|
|
492
653
|
|
|
493
654
|
# Test merging three columns with custom separator
|
|
494
655
|
expected_output_custom = df.copy()
|
|
495
|
-
expected_output_custom[
|
|
656
|
+
expected_output_custom["Merged"] = [
|
|
657
|
+
"apple-red-fruit",
|
|
658
|
+
"banana-yellow-fruit",
|
|
659
|
+
"cherry-red-fruit",
|
|
660
|
+
]
|
|
496
661
|
|
|
497
|
-
result_custom = self.dp.merge_cols_with_seperator(
|
|
662
|
+
result_custom = self.dp.merge_cols_with_seperator(
|
|
663
|
+
df.copy(),
|
|
664
|
+
col_names=["col1", "col2", "col3"],
|
|
665
|
+
seperator="-",
|
|
666
|
+
)
|
|
498
667
|
pd.testing.assert_frame_equal(result_custom, expected_output_custom)
|
|
499
668
|
|
|
500
669
|
# Test merging with starting and ending prefix
|
|
501
670
|
expected_output_prefix = df.copy()
|
|
502
|
-
expected_output_prefix[
|
|
671
|
+
expected_output_prefix["Merged"] = [
|
|
672
|
+
"Start:apple_red:End",
|
|
673
|
+
"Start:banana_yellow:End",
|
|
674
|
+
"Start:cherry_red:End",
|
|
675
|
+
]
|
|
503
676
|
|
|
504
677
|
result_prefix = self.dp.merge_cols_with_seperator(
|
|
505
678
|
df.copy(),
|
|
506
|
-
col_names=[
|
|
507
|
-
seperator=
|
|
508
|
-
starting_prefix_str=
|
|
509
|
-
ending_prefix_str=
|
|
679
|
+
col_names=["col1", "col2"],
|
|
680
|
+
seperator="_",
|
|
681
|
+
starting_prefix_str="Start:",
|
|
682
|
+
ending_prefix_str=":End",
|
|
510
683
|
)
|
|
511
684
|
pd.testing.assert_frame_equal(result_prefix, expected_output_prefix)
|
|
512
685
|
|
|
513
686
|
# Test error for less than two columns
|
|
514
687
|
with self.assertRaises(ValueError):
|
|
515
|
-
self.dp.merge_cols_with_seperator(df.copy(), col_names=[
|
|
688
|
+
self.dp.merge_cols_with_seperator(df.copy(), col_names=["col1"])
|
|
516
689
|
|
|
517
690
|
def test_check_sum_of_df_cols_are_equal(self):
|
|
518
691
|
# Create test DataFrames
|
|
519
|
-
df1 = pd.DataFrame({
|
|
520
|
-
'col1': [1, 2, 3],
|
|
521
|
-
'col2': [4, 5, 6]
|
|
522
|
-
})
|
|
692
|
+
df1 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
|
|
523
693
|
|
|
524
|
-
df2 = pd.DataFrame({
|
|
525
|
-
'colA': [1, 2, 3],
|
|
526
|
-
'colB': [4, 5, 6]
|
|
527
|
-
})
|
|
694
|
+
df2 = pd.DataFrame({"colA": [1, 2, 3], "colB": [4, 5, 6]})
|
|
528
695
|
|
|
529
|
-
df3 = pd.DataFrame({
|
|
530
|
-
'colX': [1, 2, 3],
|
|
531
|
-
'colY': [4, 5, 7]
|
|
532
|
-
})
|
|
696
|
+
df3 = pd.DataFrame({"colX": [1, 2, 3], "colY": [4, 5, 7]})
|
|
533
697
|
|
|
534
698
|
# Test case where sums are equal
|
|
535
|
-
result_equal = self.dp.check_sum_of_df_cols_are_equal(
|
|
699
|
+
result_equal = self.dp.check_sum_of_df_cols_are_equal(
|
|
700
|
+
df1,
|
|
701
|
+
df2,
|
|
702
|
+
cols_1=["col1", "col2"],
|
|
703
|
+
cols_2=["colA", "colB"],
|
|
704
|
+
)
|
|
536
705
|
self.assertEqual(result_equal[0], "They are equal")
|
|
537
706
|
self.assertEqual(result_equal[1], 21) # Sum of df1's columns
|
|
538
707
|
self.assertEqual(result_equal[2], 21) # Sum of df2's columns
|
|
539
708
|
|
|
540
709
|
# Test case where sums are not equal
|
|
541
|
-
result_not_equal = self.dp.check_sum_of_df_cols_are_equal(
|
|
710
|
+
result_not_equal = self.dp.check_sum_of_df_cols_are_equal(
|
|
711
|
+
df1,
|
|
712
|
+
df3,
|
|
713
|
+
cols_1=["col1", "col2"],
|
|
714
|
+
cols_2=["colX", "colY"],
|
|
715
|
+
)
|
|
542
716
|
self.assertTrue(result_not_equal[0].startswith("They are different by "))
|
|
543
717
|
self.assertEqual(result_not_equal[1], 21) # Sum of df1's columns
|
|
544
718
|
self.assertEqual(result_not_equal[2], 22) # Sum of df3's columns
|
|
545
719
|
|
|
546
720
|
# Test case with mismatched column names
|
|
547
721
|
with self.assertRaises(KeyError):
|
|
548
|
-
self.dp.check_sum_of_df_cols_are_equal(
|
|
722
|
+
self.dp.check_sum_of_df_cols_are_equal(
|
|
723
|
+
df1,
|
|
724
|
+
df2,
|
|
725
|
+
cols_1=["nonexistent_col"],
|
|
726
|
+
cols_2=["colA", "colB"],
|
|
727
|
+
)
|
|
549
728
|
|
|
550
729
|
# Test case with empty columns
|
|
551
|
-
result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(
|
|
730
|
+
result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(
|
|
731
|
+
df1,
|
|
732
|
+
df2,
|
|
733
|
+
cols_1=[],
|
|
734
|
+
cols_2=[],
|
|
735
|
+
)
|
|
552
736
|
self.assertEqual(result_empty_cols[1], 0) # Sum of empty columns
|
|
553
737
|
self.assertEqual(result_empty_cols[2], 0) # Sum of empty columns
|
|
554
738
|
self.assertEqual(result_empty_cols[0], "They are equal")
|
|
555
739
|
|
|
556
740
|
def test_convert_2_df_cols_to_dict(self):
|
|
557
741
|
# Create a test DataFrame
|
|
558
|
-
df = pd.DataFrame(
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
})
|
|
742
|
+
df = pd.DataFrame(
|
|
743
|
+
{"key_col": ["key1", "key2", "key3"], "value_col": [10, 20, 30]},
|
|
744
|
+
)
|
|
562
745
|
|
|
563
746
|
# Expected dictionary
|
|
564
|
-
expected_dict = {
|
|
565
|
-
'key1': 10,
|
|
566
|
-
'key2': 20,
|
|
567
|
-
'key3': 30
|
|
568
|
-
}
|
|
747
|
+
expected_dict = {"key1": 10, "key2": 20, "key3": 30}
|
|
569
748
|
|
|
570
749
|
# Test basic functionality
|
|
571
|
-
result = self.dp.convert_2_df_cols_to_dict(df,
|
|
750
|
+
result = self.dp.convert_2_df_cols_to_dict(df, "key_col", "value_col")
|
|
572
751
|
self.assertEqual(result, expected_dict)
|
|
573
752
|
|
|
574
753
|
# Test with non-unique keys
|
|
575
|
-
df_non_unique = pd.DataFrame(
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
})
|
|
754
|
+
df_non_unique = pd.DataFrame(
|
|
755
|
+
{"key_col": ["key1", "key2", "key1"], "value_col": [10, 20, 30]},
|
|
756
|
+
)
|
|
579
757
|
expected_dict_non_unique = {
|
|
580
|
-
|
|
581
|
-
|
|
758
|
+
"key1": 30, # Last occurrence of 'key1' should overwrite the earlier one
|
|
759
|
+
"key2": 20,
|
|
582
760
|
}
|
|
583
|
-
result_non_unique = self.dp.convert_2_df_cols_to_dict(
|
|
761
|
+
result_non_unique = self.dp.convert_2_df_cols_to_dict(
|
|
762
|
+
df_non_unique,
|
|
763
|
+
"key_col",
|
|
764
|
+
"value_col",
|
|
765
|
+
)
|
|
584
766
|
self.assertEqual(result_non_unique, expected_dict_non_unique)
|
|
585
767
|
|
|
586
768
|
# Test with missing key or value column
|
|
587
769
|
with self.assertRaises(ValueError):
|
|
588
|
-
self.dp.convert_2_df_cols_to_dict(df,
|
|
770
|
+
self.dp.convert_2_df_cols_to_dict(df, "missing_key_col", "value_col")
|
|
589
771
|
|
|
590
772
|
with self.assertRaises(ValueError):
|
|
591
|
-
self.dp.convert_2_df_cols_to_dict(df,
|
|
773
|
+
self.dp.convert_2_df_cols_to_dict(df, "key_col", "missing_value_col")
|
|
592
774
|
|
|
593
775
|
# Test with empty DataFrame
|
|
594
|
-
df_empty = pd.DataFrame(columns=[
|
|
776
|
+
df_empty = pd.DataFrame(columns=["key_col", "value_col"])
|
|
595
777
|
expected_empty_dict = {}
|
|
596
|
-
result_empty = self.dp.convert_2_df_cols_to_dict(
|
|
778
|
+
result_empty = self.dp.convert_2_df_cols_to_dict(
|
|
779
|
+
df_empty,
|
|
780
|
+
"key_col",
|
|
781
|
+
"value_col",
|
|
782
|
+
)
|
|
597
783
|
self.assertEqual(result_empty, expected_empty_dict)
|
|
598
784
|
|
|
599
785
|
def test_keyword_lookup_replacement(self):
|
|
600
786
|
# Create a test DataFrame
|
|
601
787
|
test_data = {
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
788
|
+
"col1": ["A", "B", "C", "D"],
|
|
789
|
+
"col2": ["X", "Y", "Z", "W"],
|
|
790
|
+
"value_col": ["old_value", "old_value", "unchanged", "old_value"],
|
|
605
791
|
}
|
|
606
792
|
df = pd.DataFrame(test_data)
|
|
607
793
|
|
|
608
794
|
# Lookup dictionary for replacements
|
|
609
|
-
lookup_dict = {
|
|
610
|
-
'A|X': 'new_value_1',
|
|
611
|
-
'B|Y': 'new_value_2',
|
|
612
|
-
'D|W': 'new_value_3'
|
|
613
|
-
}
|
|
795
|
+
lookup_dict = {"A|X": "new_value_1", "B|Y": "new_value_2", "D|W": "new_value_3"}
|
|
614
796
|
|
|
615
797
|
# Expected output
|
|
616
798
|
expected_output = df.copy()
|
|
617
|
-
expected_output[
|
|
799
|
+
expected_output["Updated Column"] = [
|
|
800
|
+
"new_value_1",
|
|
801
|
+
"new_value_2",
|
|
802
|
+
"unchanged",
|
|
803
|
+
"new_value_3",
|
|
804
|
+
]
|
|
618
805
|
|
|
619
806
|
# Apply the function
|
|
620
807
|
result = self.dp.keyword_lookup_replacement(
|
|
621
808
|
df.copy(),
|
|
622
|
-
col=
|
|
623
|
-
replacement_rows=
|
|
624
|
-
cols_to_merge=[
|
|
625
|
-
replacement_lookup_dict=lookup_dict
|
|
809
|
+
col="value_col",
|
|
810
|
+
replacement_rows="old_value",
|
|
811
|
+
cols_to_merge=["col1", "col2"],
|
|
812
|
+
replacement_lookup_dict=lookup_dict,
|
|
626
813
|
)
|
|
627
814
|
|
|
628
815
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
629
816
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
630
817
|
|
|
631
818
|
# Test case where no replacement is needed
|
|
632
|
-
df_no_replacement = pd.DataFrame(
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
819
|
+
df_no_replacement = pd.DataFrame(
|
|
820
|
+
{
|
|
821
|
+
"col1": ["E", "F"],
|
|
822
|
+
"col2": ["G", "H"],
|
|
823
|
+
"value_col": ["unchanged", "unchanged"],
|
|
824
|
+
},
|
|
825
|
+
)
|
|
637
826
|
expected_no_replacement = df_no_replacement.copy()
|
|
638
|
-
expected_no_replacement[
|
|
827
|
+
expected_no_replacement["Updated Column"] = ["unchanged", "unchanged"]
|
|
639
828
|
|
|
640
829
|
result_no_replacement = self.dp.keyword_lookup_replacement(
|
|
641
830
|
df_no_replacement.copy(),
|
|
642
|
-
col=
|
|
643
|
-
replacement_rows=
|
|
644
|
-
cols_to_merge=[
|
|
645
|
-
replacement_lookup_dict=lookup_dict
|
|
831
|
+
col="value_col",
|
|
832
|
+
replacement_rows="old_value",
|
|
833
|
+
cols_to_merge=["col1", "col2"],
|
|
834
|
+
replacement_lookup_dict=lookup_dict,
|
|
646
835
|
)
|
|
647
836
|
|
|
648
837
|
pd.testing.assert_frame_equal(result_no_replacement, expected_no_replacement)
|
|
649
|
-
|
|
838
|
+
|
|
650
839
|
def test_convert_df_wide_2_long(self):
|
|
651
840
|
# Create a test DataFrame
|
|
652
841
|
test_data = {
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
842
|
+
"id": [1, 2, 3],
|
|
843
|
+
"name": ["Alice", "Bob", "Charlie"],
|
|
844
|
+
"score1": [85, 90, 78],
|
|
845
|
+
"score2": [88, 92, 81],
|
|
657
846
|
}
|
|
658
847
|
df = pd.DataFrame(test_data)
|
|
659
848
|
|
|
660
849
|
# Expected output for the transformation
|
|
661
|
-
expected_output = pd.DataFrame(
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
850
|
+
expected_output = pd.DataFrame(
|
|
851
|
+
{
|
|
852
|
+
"id": [1, 2, 3, 1, 2, 3],
|
|
853
|
+
"name": ["Alice", "Bob", "Charlie", "Alice", "Bob", "Charlie"],
|
|
854
|
+
"Stacked": ["score1", "score1", "score1", "score2", "score2", "score2"],
|
|
855
|
+
"Value": [85, 90, 78, 88, 92, 81],
|
|
856
|
+
},
|
|
857
|
+
)
|
|
667
858
|
|
|
668
859
|
# Apply the function
|
|
669
860
|
result = self.dp.convert_df_wide_2_long(
|
|
670
861
|
df.copy(),
|
|
671
|
-
value_cols=[
|
|
672
|
-
variable_col_name=
|
|
673
|
-
value_col_name=
|
|
862
|
+
value_cols=["score1", "score2"],
|
|
863
|
+
variable_col_name="Stacked",
|
|
864
|
+
value_col_name="Value",
|
|
674
865
|
)
|
|
675
866
|
|
|
676
867
|
# Compare the resulting DataFrame with the expected DataFrame
|
|
@@ -680,9 +871,9 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
680
871
|
with self.assertRaises(ValueError):
|
|
681
872
|
self.dp.convert_df_wide_2_long(
|
|
682
873
|
df.copy(),
|
|
683
|
-
value_cols=[
|
|
684
|
-
variable_col_name=
|
|
685
|
-
value_col_name=
|
|
874
|
+
value_cols=["score1"],
|
|
875
|
+
variable_col_name="Stacked",
|
|
876
|
+
value_col_name="Value",
|
|
686
877
|
)
|
|
687
878
|
|
|
688
879
|
# Test case with no value columns (should raise ValueError)
|
|
@@ -690,24 +881,24 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
690
881
|
self.dp.convert_df_wide_2_long(
|
|
691
882
|
df.copy(),
|
|
692
883
|
value_cols=[],
|
|
693
|
-
variable_col_name=
|
|
694
|
-
value_col_name=
|
|
884
|
+
variable_col_name="Stacked",
|
|
885
|
+
value_col_name="Value",
|
|
695
886
|
)
|
|
696
887
|
|
|
697
888
|
def test_format_numbers_with_commas(self):
|
|
698
889
|
# Create a test DataFrame
|
|
699
890
|
test_data = {
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
891
|
+
"col1": [1000, 2500000, 12345.678, None],
|
|
892
|
+
"col2": [2000.5, 350000.75, 0, -12345],
|
|
893
|
+
"col3": ["text", "another text", 50000, 123.45],
|
|
703
894
|
}
|
|
704
895
|
df = pd.DataFrame(test_data).fillna(value=pd.NA) # Normalize None to pd.NA
|
|
705
896
|
|
|
706
897
|
# Expected output with 2 decimal places
|
|
707
898
|
expected_data = {
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
899
|
+
"col1": ["1,000.00", "2,500,000.00", "12,345.68", pd.NA],
|
|
900
|
+
"col2": ["2,000.50", "350,000.75", "0.00", "-12,345.00"],
|
|
901
|
+
"col3": ["text", "another text", "50,000.00", "123.45"],
|
|
711
902
|
}
|
|
712
903
|
expected_output = pd.DataFrame(expected_data)
|
|
713
904
|
|
|
@@ -720,254 +911,314 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
720
911
|
def test_filter_df_on_multiple_conditions(self):
|
|
721
912
|
# Create a test DataFrame
|
|
722
913
|
test_data = {
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
914
|
+
"id": [1, 2, 3, 4, 5],
|
|
915
|
+
"value": [10, 20, 30, 40, 50],
|
|
916
|
+
"category": ["A", "B", "A", "C", "A"],
|
|
917
|
+
"date": pd.to_datetime(
|
|
918
|
+
["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"],
|
|
919
|
+
),
|
|
727
920
|
}
|
|
728
921
|
df = pd.DataFrame(test_data)
|
|
729
922
|
|
|
730
923
|
# Test Case 1: Single condition (Equality)
|
|
731
|
-
filters_dict = {
|
|
732
|
-
expected_output = df[df[
|
|
924
|
+
filters_dict = {"category": "== 'A'"}
|
|
925
|
+
expected_output = df[df["category"] == "A"]
|
|
733
926
|
result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
|
|
734
927
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
735
928
|
|
|
736
929
|
# Test Case 2: Multiple conditions (Equality and Greater Than)
|
|
737
|
-
filters_dict = {
|
|
738
|
-
expected_output = df[(df[
|
|
930
|
+
filters_dict = {"category": "== 'A'", "value": "> 20"}
|
|
931
|
+
expected_output = df[(df["category"] == "A") & (df["value"] > 20)]
|
|
739
932
|
result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
|
|
740
933
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
741
934
|
|
|
742
935
|
# Test Case 3: Date comparison
|
|
743
|
-
filters_dict = {
|
|
744
|
-
expected_output = df[df[
|
|
936
|
+
filters_dict = {"date": ">= '2023-01-03'"}
|
|
937
|
+
expected_output = df[df["date"] >= pd.to_datetime("2023-01-03")]
|
|
745
938
|
result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
|
|
746
939
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
747
940
|
|
|
748
941
|
# Test Case 4: Inequality
|
|
749
|
-
filters_dict = {
|
|
750
|
-
expected_output = df[df[
|
|
942
|
+
filters_dict = {"value": "!= 30"}
|
|
943
|
+
expected_output = df[df["value"] != 30]
|
|
751
944
|
result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
|
|
752
945
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
753
946
|
|
|
754
947
|
# Test Case 5: Mixed conditions
|
|
755
|
-
filters_dict = {
|
|
756
|
-
expected_output = df[
|
|
948
|
+
filters_dict = {"category": "== 'A'", "date": "<= '2023-01-03'"}
|
|
949
|
+
expected_output = df[
|
|
950
|
+
(df["category"] == "A") & (df["date"] <= pd.to_datetime("2023-01-03"))
|
|
951
|
+
]
|
|
757
952
|
result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
|
|
758
953
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
759
954
|
|
|
760
955
|
def test_fill_weekly_date_range(self):
|
|
761
956
|
# Test input DataFrame
|
|
762
957
|
test_data = {
|
|
763
|
-
|
|
764
|
-
|
|
958
|
+
"date": ["2023-01-02", "2023-01-16", "2023-01-30"], # Weekly data with gaps
|
|
959
|
+
"value": [10.0, 20.0, 30.0],
|
|
765
960
|
}
|
|
766
961
|
df = pd.DataFrame(test_data)
|
|
767
|
-
df[
|
|
962
|
+
df["date"] = pd.to_datetime(df["date"])
|
|
768
963
|
|
|
769
964
|
# Expected output DataFrame
|
|
770
965
|
expected_data = {
|
|
771
|
-
|
|
772
|
-
|
|
966
|
+
"date": [
|
|
967
|
+
"2023-01-02",
|
|
968
|
+
"2023-01-09",
|
|
969
|
+
"2023-01-16",
|
|
970
|
+
"2023-01-23",
|
|
971
|
+
"2023-01-30",
|
|
972
|
+
],
|
|
973
|
+
"value": [10.0, 0.0, 20.0, 0.0, 30.0],
|
|
773
974
|
}
|
|
774
975
|
expected_output = pd.DataFrame(expected_data)
|
|
775
|
-
expected_output[
|
|
976
|
+
expected_output["date"] = pd.to_datetime(expected_output["date"])
|
|
776
977
|
|
|
777
978
|
# Call the function
|
|
778
979
|
dp = dataprocessing() # Replace with the correct instantiation of your class
|
|
779
|
-
result = dp.fill_weekly_date_range(df, date_column=
|
|
980
|
+
result = dp.fill_weekly_date_range(df, date_column="date", freq="W-MON")
|
|
780
981
|
|
|
781
982
|
# Assert the result matches the expected output
|
|
782
|
-
pd.testing.assert_frame_equal(
|
|
983
|
+
pd.testing.assert_frame_equal(
|
|
984
|
+
result.reset_index(drop=True),
|
|
985
|
+
expected_output.reset_index(drop=True),
|
|
986
|
+
)
|
|
783
987
|
|
|
784
988
|
def test_add_prefix_and_suffix(self):
|
|
785
989
|
# Test DataFrame
|
|
786
990
|
test_data = {
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
991
|
+
"date": ["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
992
|
+
"value1": [10, 20, 30],
|
|
993
|
+
"value2": [40, 50, 60],
|
|
790
994
|
}
|
|
791
995
|
df = pd.DataFrame(test_data)
|
|
792
996
|
|
|
793
997
|
# Expected output when no date column is excluded
|
|
794
998
|
expected_data_no_date_col = {
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
999
|
+
"prefix_date_suffix": ["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
1000
|
+
"prefix_value1_suffix": [10, 20, 30],
|
|
1001
|
+
"prefix_value2_suffix": [40, 50, 60],
|
|
798
1002
|
}
|
|
799
1003
|
expected_output_no_date_col = pd.DataFrame(expected_data_no_date_col)
|
|
800
1004
|
|
|
801
1005
|
# Expected output when date column is excluded
|
|
802
1006
|
expected_data_with_date_col = {
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
1007
|
+
"date": ["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
1008
|
+
"prefix_value1_suffix": [10, 20, 30],
|
|
1009
|
+
"prefix_value2_suffix": [40, 50, 60],
|
|
806
1010
|
}
|
|
807
1011
|
expected_output_with_date_col = pd.DataFrame(expected_data_with_date_col)
|
|
808
1012
|
|
|
809
1013
|
# Call the function without excluding a date column
|
|
810
1014
|
dp = dataprocessing() # Replace with the correct instantiation of your class
|
|
811
|
-
result_no_date_col = dp.add_prefix_and_suffix(
|
|
1015
|
+
result_no_date_col = dp.add_prefix_and_suffix(
|
|
1016
|
+
df.copy(),
|
|
1017
|
+
prefix="prefix_",
|
|
1018
|
+
suffix="_suffix",
|
|
1019
|
+
)
|
|
812
1020
|
|
|
813
1021
|
# Assert result matches the expected output
|
|
814
1022
|
pd.testing.assert_frame_equal(result_no_date_col, expected_output_no_date_col)
|
|
815
1023
|
|
|
816
1024
|
# Call the function with a date column excluded
|
|
817
|
-
result_with_date_col = dp.add_prefix_and_suffix(
|
|
1025
|
+
result_with_date_col = dp.add_prefix_and_suffix(
|
|
1026
|
+
df.copy(),
|
|
1027
|
+
prefix="prefix_",
|
|
1028
|
+
suffix="_suffix",
|
|
1029
|
+
date_col="date",
|
|
1030
|
+
)
|
|
818
1031
|
|
|
819
1032
|
# Assert result matches the expected output
|
|
820
|
-
pd.testing.assert_frame_equal(
|
|
1033
|
+
pd.testing.assert_frame_equal(
|
|
1034
|
+
result_with_date_col,
|
|
1035
|
+
expected_output_with_date_col,
|
|
1036
|
+
)
|
|
821
1037
|
|
|
822
1038
|
def test_create_dummies(self):
|
|
823
1039
|
# Test Case 1: Basic functionality without date column
|
|
824
|
-
df = pd.DataFrame({
|
|
825
|
-
'col1': [0, 1, 2],
|
|
826
|
-
'col2': [3, 4, 0],
|
|
827
|
-
'col3': [5, 0, 0]
|
|
828
|
-
})
|
|
1040
|
+
df = pd.DataFrame({"col1": [0, 1, 2], "col2": [3, 4, 0], "col3": [5, 0, 0]})
|
|
829
1041
|
dummy_threshold = 1
|
|
830
|
-
expected_output = pd.DataFrame(
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
'col3': [1, 0, 0]
|
|
834
|
-
})
|
|
1042
|
+
expected_output = pd.DataFrame(
|
|
1043
|
+
{"col1": [0, 0, 1], "col2": [1, 1, 0], "col3": [1, 0, 0]},
|
|
1044
|
+
)
|
|
835
1045
|
result = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold)
|
|
836
1046
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
837
1047
|
|
|
838
1048
|
# Test Case 2: With date column
|
|
839
|
-
df_with_date = pd.DataFrame(
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
1049
|
+
df_with_date = pd.DataFrame(
|
|
1050
|
+
{
|
|
1051
|
+
"date": ["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
1052
|
+
"col1": [0, 1, 2],
|
|
1053
|
+
"col2": [3, 4, 0],
|
|
1054
|
+
},
|
|
1055
|
+
)
|
|
1056
|
+
expected_output_with_date = pd.DataFrame(
|
|
1057
|
+
{
|
|
1058
|
+
"date": ["2023-01-01", "2023-01-02", "2023-01-03"],
|
|
1059
|
+
"col1": [0, 0, 1],
|
|
1060
|
+
"col2": [1, 1, 0],
|
|
1061
|
+
},
|
|
1062
|
+
)
|
|
1063
|
+
result_with_date = self.dp.create_dummies(
|
|
1064
|
+
df_with_date.copy(),
|
|
1065
|
+
date_col="date",
|
|
1066
|
+
dummy_threshold=dummy_threshold,
|
|
1067
|
+
)
|
|
850
1068
|
pd.testing.assert_frame_equal(result_with_date, expected_output_with_date)
|
|
851
1069
|
|
|
852
1070
|
# Test Case 3: Adding total dummy column
|
|
853
1071
|
expected_output_with_total = expected_output.copy()
|
|
854
|
-
expected_output_with_total[
|
|
855
|
-
result_with_total = self.dp.create_dummies(
|
|
1072
|
+
expected_output_with_total["total"] = [1, 1, 1]
|
|
1073
|
+
result_with_total = self.dp.create_dummies(
|
|
1074
|
+
df.copy(),
|
|
1075
|
+
dummy_threshold=dummy_threshold,
|
|
1076
|
+
add_total_dummy_col="Yes",
|
|
1077
|
+
)
|
|
856
1078
|
pd.testing.assert_frame_equal(result_with_total, expected_output_with_total)
|
|
857
1079
|
|
|
858
1080
|
# Test Case 4: Adding total dummy column with date column
|
|
859
1081
|
expected_output_with_date_and_total = expected_output_with_date.copy()
|
|
860
|
-
expected_output_with_date_and_total[
|
|
1082
|
+
expected_output_with_date_and_total["total"] = [1, 1, 1]
|
|
861
1083
|
result_with_date_and_total = self.dp.create_dummies(
|
|
862
1084
|
df_with_date.copy(),
|
|
863
|
-
date_col=
|
|
1085
|
+
date_col="date",
|
|
864
1086
|
dummy_threshold=dummy_threshold,
|
|
865
|
-
add_total_dummy_col=
|
|
1087
|
+
add_total_dummy_col="Yes",
|
|
1088
|
+
)
|
|
1089
|
+
pd.testing.assert_frame_equal(
|
|
1090
|
+
result_with_date_and_total,
|
|
1091
|
+
expected_output_with_date_and_total,
|
|
866
1092
|
)
|
|
867
|
-
pd.testing.assert_frame_equal(result_with_date_and_total, expected_output_with_date_and_total)
|
|
868
1093
|
|
|
869
1094
|
# Test Case 5: Threshold of 0 (all positive numbers become 1)
|
|
870
|
-
df_threshold_0 = pd.DataFrame({
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
result_threshold_0 = self.dp.create_dummies(df_threshold_0.copy(), dummy_threshold=0)
|
|
1095
|
+
df_threshold_0 = pd.DataFrame({"col1": [-1, 0, 1], "col2": [0, 2, -3]})
|
|
1096
|
+
expected_output_threshold_0 = pd.DataFrame(
|
|
1097
|
+
{"col1": [0, 0, 1], "col2": [0, 1, 0]},
|
|
1098
|
+
)
|
|
1099
|
+
result_threshold_0 = self.dp.create_dummies(
|
|
1100
|
+
df_threshold_0.copy(),
|
|
1101
|
+
dummy_threshold=0,
|
|
1102
|
+
)
|
|
879
1103
|
pd.testing.assert_frame_equal(result_threshold_0, expected_output_threshold_0)
|
|
880
1104
|
|
|
881
1105
|
def test_replace_substrings(self):
|
|
882
1106
|
# Test Case 1: Basic replacement
|
|
883
|
-
df = pd.DataFrame(
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
replacements = {
|
|
887
|
-
expected_output = pd.DataFrame(
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
result = self.dp.replace_substrings(df.copy(),
|
|
1107
|
+
df = pd.DataFrame(
|
|
1108
|
+
{"text": ["hello world", "python programming", "hello python"]},
|
|
1109
|
+
)
|
|
1110
|
+
replacements = {"hello": "hi", "python": "java"}
|
|
1111
|
+
expected_output = pd.DataFrame(
|
|
1112
|
+
{"text": ["hi world", "java programming", "hi java"]},
|
|
1113
|
+
)
|
|
1114
|
+
result = self.dp.replace_substrings(df.copy(), "text", replacements)
|
|
891
1115
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
892
1116
|
|
|
893
1117
|
# Test Case 2: Replacement with to_lower=True
|
|
894
|
-
df_mixed_case = pd.DataFrame(
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
expected_output_lower = pd.DataFrame(
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
result_lower = self.dp.replace_substrings(
|
|
1118
|
+
df_mixed_case = pd.DataFrame(
|
|
1119
|
+
{"text": ["Hello World", "PYTHON Programming", "hello PYTHON"]},
|
|
1120
|
+
)
|
|
1121
|
+
expected_output_lower = pd.DataFrame(
|
|
1122
|
+
{"text": ["hi world", "java programming", "hi java"]},
|
|
1123
|
+
)
|
|
1124
|
+
result_lower = self.dp.replace_substrings(
|
|
1125
|
+
df_mixed_case.copy(),
|
|
1126
|
+
"text",
|
|
1127
|
+
replacements,
|
|
1128
|
+
to_lower=True,
|
|
1129
|
+
)
|
|
901
1130
|
pd.testing.assert_frame_equal(result_lower, expected_output_lower)
|
|
902
1131
|
|
|
903
1132
|
# Test Case 3: Replacement with a new column
|
|
904
|
-
df_new_col = pd.DataFrame(
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
expected_output_new_col = pd.DataFrame(
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
1133
|
+
df_new_col = pd.DataFrame(
|
|
1134
|
+
{"text": ["hello world", "python programming", "hello python"]},
|
|
1135
|
+
)
|
|
1136
|
+
expected_output_new_col = pd.DataFrame(
|
|
1137
|
+
{
|
|
1138
|
+
"text": ["hello world", "python programming", "hello python"],
|
|
1139
|
+
"new_text": ["hi world", "java programming", "hi java"],
|
|
1140
|
+
},
|
|
1141
|
+
)
|
|
1142
|
+
result_new_col = self.dp.replace_substrings(
|
|
1143
|
+
df_new_col.copy(),
|
|
1144
|
+
"text",
|
|
1145
|
+
replacements,
|
|
1146
|
+
new_column="new_text",
|
|
1147
|
+
)
|
|
912
1148
|
pd.testing.assert_frame_equal(result_new_col, expected_output_new_col)
|
|
913
1149
|
|
|
914
1150
|
def test_add_total_column(self):
|
|
915
1151
|
# Test Case 1: Basic functionality without excluding any column
|
|
916
|
-
df = pd.DataFrame({
|
|
917
|
-
'col1': [1, 2, 3],
|
|
918
|
-
'col2': [4, 5, 6],
|
|
919
|
-
'col3': [7, 8, 9]
|
|
920
|
-
})
|
|
1152
|
+
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9]})
|
|
921
1153
|
expected_output = df.copy()
|
|
922
|
-
expected_output[
|
|
1154
|
+
expected_output["Total"] = [12, 15, 18]
|
|
923
1155
|
result = self.dp.add_total_column(df.copy())
|
|
924
1156
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
925
1157
|
|
|
926
1158
|
# Test Case 2: Excluding a column from the total
|
|
927
|
-
df = pd.DataFrame({
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
})
|
|
938
|
-
result_exclude = self.dp.add_total_column(df.copy(), exclude_col='col3')
|
|
1159
|
+
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9]})
|
|
1160
|
+
expected_output_exclude = pd.DataFrame(
|
|
1161
|
+
{
|
|
1162
|
+
"col1": [1, 2, 3],
|
|
1163
|
+
"col2": [4, 5, 6],
|
|
1164
|
+
"col3": [7, 8, 9],
|
|
1165
|
+
"Total": [5, 7, 9], # Sum without 'col3'
|
|
1166
|
+
},
|
|
1167
|
+
)
|
|
1168
|
+
result_exclude = self.dp.add_total_column(df.copy(), exclude_col="col3")
|
|
939
1169
|
pd.testing.assert_frame_equal(result_exclude, expected_output_exclude)
|
|
940
1170
|
|
|
941
1171
|
# Test Case 3: Custom total column name
|
|
942
|
-
custom_total_col_name =
|
|
1172
|
+
custom_total_col_name = "Sum"
|
|
943
1173
|
expected_output_custom = df.copy()
|
|
944
1174
|
expected_output_custom[custom_total_col_name] = [12, 15, 18]
|
|
945
|
-
result_custom = self.dp.add_total_column(
|
|
1175
|
+
result_custom = self.dp.add_total_column(
|
|
1176
|
+
df.copy(),
|
|
1177
|
+
total_col_name=custom_total_col_name,
|
|
1178
|
+
)
|
|
946
1179
|
pd.testing.assert_frame_equal(result_custom, expected_output_custom)
|
|
947
1180
|
|
|
948
1181
|
# Test Case 4: DataFrame with a single column
|
|
949
|
-
single_col_df = pd.DataFrame({
|
|
1182
|
+
single_col_df = pd.DataFrame({"col1": [1, 2, 3]})
|
|
950
1183
|
expected_single_col = single_col_df.copy()
|
|
951
|
-
expected_single_col[
|
|
1184
|
+
expected_single_col["Total"] = [1, 2, 3]
|
|
952
1185
|
result_single_col = self.dp.add_total_column(single_col_df.copy())
|
|
953
1186
|
pd.testing.assert_frame_equal(result_single_col, expected_single_col)
|
|
954
1187
|
|
|
955
1188
|
def test_apply_lookup_table_based_on_substring(self):
|
|
956
1189
|
# Test Case 1: Basic categorization
|
|
957
|
-
df = pd.DataFrame(
|
|
958
|
-
|
|
959
|
-
|
|
1190
|
+
df = pd.DataFrame(
|
|
1191
|
+
{
|
|
1192
|
+
"text": [
|
|
1193
|
+
"I love apples",
|
|
1194
|
+
"Bananas are great",
|
|
1195
|
+
"Something else",
|
|
1196
|
+
"Grapes are sour",
|
|
1197
|
+
],
|
|
1198
|
+
},
|
|
1199
|
+
)
|
|
960
1200
|
category_dict = {
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
1201
|
+
"apple": "Fruit",
|
|
1202
|
+
"banana": "Fruit",
|
|
1203
|
+
"cherry": "Fruit",
|
|
1204
|
+
"grape": "Fruit",
|
|
965
1205
|
}
|
|
966
|
-
expected_output = pd.DataFrame(
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
1206
|
+
expected_output = pd.DataFrame(
|
|
1207
|
+
{
|
|
1208
|
+
"text": [
|
|
1209
|
+
"I love apples",
|
|
1210
|
+
"Bananas are great",
|
|
1211
|
+
"Something else",
|
|
1212
|
+
"Grapes are sour",
|
|
1213
|
+
],
|
|
1214
|
+
"Category": ["Fruit", "Fruit", "Other", "Fruit"],
|
|
1215
|
+
},
|
|
1216
|
+
)
|
|
1217
|
+
result = self.dp.apply_lookup_table_based_on_substring(
|
|
1218
|
+
df.copy(),
|
|
1219
|
+
"text",
|
|
1220
|
+
category_dict,
|
|
1221
|
+
)
|
|
971
1222
|
pd.testing.assert_frame_equal(result, expected_output)
|
|
972
1223
|
|
|
973
1224
|
def test_compare_overlap(self):
|
|
@@ -993,29 +1244,30 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
993
1244
|
df2 = pd.DataFrame(df2_data)
|
|
994
1245
|
|
|
995
1246
|
# 3. Call compare_overlap from your dataprocessing class
|
|
996
|
-
diff_df, total_diff_df = self.dp.compare_overlap(df1, df2,
|
|
997
|
-
expected_diff_df = pd.DataFrame(
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1247
|
+
diff_df, total_diff_df = self.dp.compare_overlap(df1, df2, "date")
|
|
1248
|
+
expected_diff_df = pd.DataFrame(
|
|
1249
|
+
{
|
|
1250
|
+
"date": pd.to_datetime(["2021-01-03", "2021-01-04"]),
|
|
1251
|
+
"diff_value": [-2, 5],
|
|
1252
|
+
"diff_count": [1, -1],
|
|
1253
|
+
},
|
|
1254
|
+
)
|
|
1255
|
+
|
|
1256
|
+
expected_total_diff_df = pd.DataFrame(
|
|
1257
|
+
{"Column": ["value", "count"], "Total Difference": [3, 0]},
|
|
1258
|
+
)
|
|
1007
1259
|
|
|
1008
1260
|
# 5. Use pd.testing.assert_frame_equal to check the outputs
|
|
1009
1261
|
# Sort and reset index to ensure matching row order
|
|
1010
1262
|
pd.testing.assert_frame_equal(
|
|
1011
|
-
diff_df.sort_values(
|
|
1012
|
-
expected_diff_df.sort_values(
|
|
1263
|
+
diff_df.sort_values("date").reset_index(drop=True),
|
|
1264
|
+
expected_diff_df.sort_values("date").reset_index(drop=True),
|
|
1013
1265
|
)
|
|
1014
1266
|
|
|
1015
1267
|
# Sort by 'Column' to ensure matching row order in summary
|
|
1016
1268
|
pd.testing.assert_frame_equal(
|
|
1017
|
-
total_diff_df.sort_values(
|
|
1018
|
-
expected_total_diff_df.sort_values(
|
|
1269
|
+
total_diff_df.sort_values("Column").reset_index(drop=True),
|
|
1270
|
+
expected_total_diff_df.sort_values("Column").reset_index(drop=True),
|
|
1019
1271
|
)
|
|
1020
1272
|
|
|
1021
1273
|
def test_week_commencing_2_week_commencing_conversion_isoweekday(self):
|
|
@@ -1035,29 +1287,28 @@ class TestDataProcessor(unittest.TestCase):
|
|
|
1035
1287
|
pd.Timestamp("2023-01-02"), # Friday
|
|
1036
1288
|
pd.Timestamp("2023-01-02"), # Saturday
|
|
1037
1289
|
],
|
|
1038
|
-
name="week_start_mon"
|
|
1290
|
+
name="week_start_mon",
|
|
1039
1291
|
)
|
|
1040
1292
|
|
|
1041
1293
|
# Use the new function from our data processing object
|
|
1042
1294
|
result = self.dp.week_commencing_2_week_commencing_conversion_isoweekday(
|
|
1043
1295
|
df.copy(),
|
|
1044
1296
|
date_col="date",
|
|
1045
|
-
week_commencing="mon"
|
|
1297
|
+
week_commencing="mon",
|
|
1046
1298
|
)
|
|
1047
1299
|
|
|
1048
1300
|
# Compare the 'week_start_mon' column with our expected results
|
|
1049
1301
|
pd.testing.assert_series_equal(
|
|
1050
1302
|
result["week_start_mon"], # actual
|
|
1051
|
-
expected_mon
|
|
1303
|
+
expected_mon, # expected
|
|
1052
1304
|
)
|
|
1053
1305
|
|
|
1054
1306
|
|
|
1055
|
-
|
|
1056
1307
|
###################################################################################################################################################
|
|
1057
1308
|
###################################################################################################################################################
|
|
1058
1309
|
|
|
1059
1310
|
# class TestDataPull(unittest.TestCase)
|
|
1060
1311
|
|
|
1061
1312
|
|
|
1062
|
-
if __name__ ==
|
|
1313
|
+
if __name__ == "__main__":
|
|
1063
1314
|
unittest.main()
|