imsciences 0.6.3.0__py3-none-any.whl → 0.6.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1335 @@
+ import unittest
+ import pandas as pd
+ import numpy as np
+ import os
+ from datafunctions import dataprocessing
+ import plotly.graph_objects as go
+
+ class TestDataProcessor(unittest.TestCase):
+
+     def setUp(self):
+         self.dp = dataprocessing()
+         self.df = pd.DataFrame({
+             'date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
+             'value1': range(10),
+             'value2': range(10, 20)
+         })
+         self.mixed_date_df = pd.DataFrame({
+             'mixed_date': ['2023-01-01', '01/02/2023', '2023/03/01', '2023-04-01']
+         })
+         self.merged_df = pd.DataFrame({
+             'col1': ["A", "B", "C"],
+             'col2': ["X", "Y", "Z"]
+         })
+
+     def test_get_wd_levels(self):
+         current_dir = os.getcwd()
+         parent_dir = self.dp.get_wd_levels(1)
+         self.assertEqual(parent_dir, os.path.dirname(current_dir))
+
+     def test_remove_rows(self):
+         df = self.dp.remove_rows(self.df, 1)
+         self.assertEqual(len(df), 9)
+         self.assertEqual(df.columns.tolist(), [pd.to_datetime('2023-01-01 00:00:00'), np.int64(0), np.int64(10)])
+
+     def test_aggregate_daily_to_wc_long(self):
+         # Create a test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-08', '2023-01-09', '2023-01-10'],
+             'group_col': ['A', 'A', 'B', 'B', 'B'],
+             'value1': [10, 20, 30, 40, np.nan],
+             'value2': [100, 200, 300, np.nan, 500]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output for different test cases
+         expected_sum_output = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-08'], # Week starting on Sunday
+             'group_col': ['A', 'B'],
+             'value1': [30.0, 70.0],
+             'value2': [300.0, 800.0]
+         })
+
+         # Convert OBS column to datetime for expected DataFrame
+         expected_sum_output['OBS'] = pd.to_datetime(expected_sum_output['OBS'])
+
+         # Test sum aggregation
+         result_sum = self.dp.aggregate_daily_to_wc_long(df, 'date', ['group_col'], ['value1', 'value2'], wc='sun', aggregation='sum')
+
+         # Ensure both OBS columns are datetime for comparison
+         result_sum['OBS'] = pd.to_datetime(result_sum['OBS'])
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_sum, expected_sum_output)
+
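+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # the weekly long-format aggregation asserted above can be reproduced by
+     # snapping each date to its week-commencing day and summing per group.
+     @staticmethod
+     def _sketch_aggregate_daily_to_wc_long(df, date_col, group_cols, value_cols, wc='sun'):
+         # pandas labels weekly periods by the day the week *ends* on.
+         week_end = {'sun': 'W-SAT', 'mon': 'W-SUN'}[wc]
+         out = df.copy()
+         out[date_col] = pd.to_datetime(out[date_col])
+         out['OBS'] = out[date_col].dt.to_period(week_end).dt.start_time
+         return out.groupby(['OBS'] + group_cols, as_index=False)[value_cols].sum()
+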
+     def test_convert_monthly_to_daily(self):
+         # Create a test DataFrame with monthly data
+         test_data = {
+             'date': ['2023-01-01', '2023-02-01', '2023-03-01'],
+             'value1': [31, 28, 31],
+             'value2': [310, 280, 310]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output DataFrame when divide=True
+         expected_daily_data_divide = {
+             'date': pd.date_range(start='2023-01-01', end='2023-01-31').tolist() +
+                     pd.date_range(start='2023-02-01', end='2023-02-28').tolist() +
+                     pd.date_range(start='2023-03-01', end='2023-03-31').tolist(),
+             'value1': [1.0] * 31 + [1.0] * 28 + [1.0] * 31,
+             'value2': [10.0] * 31 + [10.0] * 28 + [10.0] * 31
+         }
+         expected_daily_df_divide = pd.DataFrame(expected_daily_data_divide)
+
+         # Call the function with divide=True
+         result_divide = self.dp.convert_monthly_to_daily(df, 'date', divide=True)
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_divide.reset_index(drop=True), expected_daily_df_divide)
+
+         # Expected output DataFrame when divide=False
+         expected_daily_data_no_divide = {
+             'date': pd.date_range(start='2023-01-01', end='2023-01-31').tolist() +
+                     pd.date_range(start='2023-02-01', end='2023-02-28').tolist() +
+                     pd.date_range(start='2023-03-01', end='2023-03-31').tolist(),
+             'value1': [31] * 31 + [28] * 28 + [31] * 31,
+             'value2': [310] * 31 + [280] * 28 + [310] * 31
+         }
+         expected_daily_df_no_divide = pd.DataFrame(expected_daily_data_no_divide)
+
+         # Call the function with divide=False
+         result_no_divide = self.dp.convert_monthly_to_daily(df, 'date', divide=False)
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_no_divide.reset_index(drop=True), expected_daily_df_no_divide)
+
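+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # expanding each monthly row into one row per day, optionally spreading
+     # the monthly value evenly across the days of that month.
+     @staticmethod
+     def _sketch_convert_monthly_to_daily(df, date_col, divide=True):
+         out = df.copy()
+         out[date_col] = pd.to_datetime(out[date_col])
+         # One list of daily dates per monthly row, then explode to rows.
+         out[date_col] = out[date_col].apply(lambda d: pd.date_range(d, d + pd.offsets.MonthEnd(0), freq='D'))
+         out = out.explode(date_col).reset_index(drop=True)
+         out[date_col] = pd.to_datetime(out[date_col])  # explode leaves object dtype
+         if divide:
+             value_cols = out.columns.drop(date_col)
+             out[value_cols] = out[value_cols].div(out[date_col].dt.days_in_month, axis=0)
+         return out
+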
106
+ def test_plot_two(self):
107
+ # Test DataFrames
108
+ df1 = pd.DataFrame({
109
+ 'date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
110
+ 'value1': [10, 20, 30, 40, 50]
111
+ })
112
+ df2 = pd.DataFrame({
113
+ 'date': pd.date_range(start='2023-01-03', periods=5, freq='D'),
114
+ 'value2': [5, 15, 25, 35, 45]
115
+ })
116
+
117
+ # Test case: Plot on the same axis
118
+ fig_same_axis = self.dp.plot_two(df1, 'value1', df2, 'value2', 'date', same_axis=True)
119
+
120
+ # Assertions for same-axis plot
121
+ self.assertEqual(len(fig_same_axis.data), 2) # Ensure two traces are present
122
+ self.assertEqual(fig_same_axis.data[0].name, 'value1') # Check trace name for df1
123
+ self.assertEqual(fig_same_axis.data[1].name, 'value2') # Check trace name for df2
124
+ self.assertEqual(fig_same_axis.data[0].yaxis, 'y') # Check y-axis for trace1 (default is 'y')
125
+ self.assertEqual(fig_same_axis.data[1].yaxis, 'y') # Check y-axis for trace2 (same axis)
126
+
127
+ # Test case: Plot on separate axes
128
+ fig_separate_axis = self.dp.plot_two(df1, 'value1', df2, 'value2', 'date', same_axis=False)
129
+
130
+ # Assertions for separate-axis plot
131
+ self.assertEqual(len(fig_separate_axis.data), 2) # Ensure two traces are present
132
+ self.assertEqual(fig_separate_axis.data[0].yaxis, 'y') # Check y-axis for trace1 (default is 'y')
133
+ self.assertEqual(fig_separate_axis.data[1].yaxis, 'y2') # Check y-axis for trace2 (separate axis)
134
+
135
+ # Check layout properties
136
+ self.assertTrue('yaxis' in fig_separate_axis.layout)
137
+ self.assertTrue('yaxis2' in fig_separate_axis.layout)
138
+ self.assertEqual(fig_separate_axis.layout.yaxis.overlaying, None) # y1 is not overlaying
139
+ self.assertEqual(fig_separate_axis.layout.yaxis2.overlaying, 'y') # y2 overlays y1
140
+
141
+ def test_remove_nan_rows(self):
142
+ # Create a test DataFrame
143
+ test_data = {
144
+ 'col1': [1.0, 2.0, np.nan, 4.0, 5.0],
145
+ 'col2': ['A', 'B', 'C', 'D', np.nan],
146
+ }
147
+ df = pd.DataFrame(test_data)
148
+
149
+ # Expected output when removing rows with NaN in 'col1'
150
+ expected_output_col1 = pd.DataFrame({
151
+ 'col1': [1.0, 2.0, 4.0, 5.0],
152
+ 'col2': ['A', 'B', 'D', np.nan],
153
+ })
154
+
155
+ # Expected output when removing rows with NaN in 'col2'
156
+ expected_output_col2 = pd.DataFrame({
157
+ 'col1': [1.0, 2.0, np.nan, 4.0],
158
+ 'col2': ['A', 'B', 'C', 'D'],
159
+ })
160
+
161
+ # Test removing NaN rows based on 'col1'
162
+ result_col1 = self.dp.remove_nan_rows(df, 'col1')
163
+ pd.testing.assert_frame_equal(result_col1.reset_index(drop=True), expected_output_col1)
164
+
165
+ # Test removing NaN rows based on 'col2'
166
+ result_col2 = self.dp.remove_nan_rows(df, 'col2')
167
+ pd.testing.assert_frame_equal(result_col2.reset_index(drop=True), expected_output_col2)
168
+
169
+ def test_filter_rows(self):
170
+ # Create a test DataFrame
171
+ test_data = {
172
+ 'col1': [1, 2, 3, 4, 5],
173
+ 'col2': ['A', 'B', 'C', 'D', 'E'],
174
+ }
175
+ df = pd.DataFrame(test_data)
176
+
177
+ # Expected output when filtering 'col1' for [2, 4]
178
+ expected_output_col1 = pd.DataFrame({
179
+ 'col1': [2, 4],
180
+ 'col2': ['B', 'D'],
181
+ })
182
+
183
+ # Expected output when filtering 'col2' for ['A', 'C', 'E']
184
+ expected_output_col2 = pd.DataFrame({
185
+ 'col1': [1, 3, 5],
186
+ 'col2': ['A', 'C', 'E'],
187
+ })
188
+
189
+ # Test filtering rows based on 'col1'
190
+ result_col1 = self.dp.filter_rows(df, 'col1', [2, 4])
191
+ pd.testing.assert_frame_equal(result_col1.reset_index(drop=True), expected_output_col1)
192
+
193
+ # Test filtering rows based on 'col2'
194
+ result_col2 = self.dp.filter_rows(df, 'col2', ['A', 'C', 'E'])
195
+ pd.testing.assert_frame_equal(result_col2.reset_index(drop=True), expected_output_col2)
196
+
197
+ def test_plot_one(self):
198
+ # Create a test DataFrame
199
+ df = pd.DataFrame({
200
+ 'date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
201
+ 'value1': [10, 20, 30, 40, 50]
202
+ })
203
+
204
+ # Test case: Valid input
205
+ fig = self.dp.plot_one(df, 'value1', 'date')
206
+
207
+ # Assertions for the plot
208
+ self.assertEqual(len(fig.data), 1) # Ensure one trace is present
209
+ self.assertEqual(fig.data[0].x.tolist(), df['date'].tolist()) # Check X-axis data matches date column
210
+ self.assertEqual(fig.data[0].y.tolist(), df['value1'].tolist()) # Check Y-axis data matches value1 column
211
+ self.assertEqual(fig.layout.plot_bgcolor, 'white') # Check background color
212
+ self.assertEqual(fig.layout.xaxis.linecolor, 'black') # Check X-axis line color
213
+ self.assertEqual(fig.layout.yaxis.linecolor, 'black') # Check Y-axis line color
214
+
215
+ # Test case: Invalid column name
216
+ with self.assertRaises(ValueError) as context:
217
+ self.dp.plot_one(df, 'nonexistent_column', 'date')
218
+ self.assertIn("Column not found in DataFrame", str(context.exception))
219
+
220
+ # Test case: Invalid date column
221
+ with self.assertRaises(ValueError) as context:
222
+ self.dp.plot_one(df, 'value1', 'nonexistent_date_column')
223
+ self.assertIn("Column not found in DataFrame", str(context.exception))
224
+
225
+ # Test case: Non-datetime date column
226
+ df_non_datetime = pd.DataFrame({
227
+ 'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
228
+ 'value1': [10, 20, 30]
229
+ })
230
+ fig_non_datetime = self.dp.plot_one(df_non_datetime, 'value1', 'date')
231
+ self.assertTrue(pd.api.types.is_datetime64_any_dtype(df_non_datetime['date'])) # Ensure conversion to datetime
232
+
233
+ def test_week_of_year_mapping(self):
234
+ # Create a test DataFrame with ISO week format
235
+ test_data = {
236
+ 'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52']
237
+ }
238
+ df = pd.DataFrame(test_data)
239
+
240
+ # Expected outputs for different start days
241
+ expected_output_mon = pd.DataFrame({
242
+ 'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52'],
243
+ 'OBS': ['02/01/2023', '30/01/2023', '06/03/2023', '25/12/2023']
244
+ })
245
+
246
+ expected_output_sun = pd.DataFrame({
247
+ 'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52'],
248
+ 'OBS': ['01/01/2023', '29/01/2023', '05/03/2023', '24/12/2023']
249
+ })
250
+
251
+ # Test mapping with Monday as start day
252
+ result_mon = self.dp.week_of_year_mapping(df.copy(), 'week_col', 'mon')
253
+ pd.testing.assert_frame_equal(result_mon, expected_output_mon)
254
+
255
+ # Test mapping with Sunday as start day
256
+ result_sun = self.dp.week_of_year_mapping(df.copy(), 'week_col', 'sun')
257
+ pd.testing.assert_frame_equal(result_sun, expected_output_sun)
258
+
259
+ # Test with invalid start day input
260
+ with self.assertRaises(ValueError) as context:
261
+ self.dp.week_of_year_mapping(df.copy(), 'week_col', 'invalid_day')
262
+ self.assertIn("Invalid day input", str(context.exception))
263
+
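+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # parsing the ISO 'YYYY-Www' label via strptime and shifting Monday back
+     # a day for Sunday-commencing weeks, formatted dd/mm/yyyy as asserted above.
+     @staticmethod
+     def _sketch_week_of_year_mapping(df, week_col, week_commencing):
+         from datetime import datetime
+         offsets = {'mon': 0, 'sun': -1}
+         if week_commencing not in offsets:
+             raise ValueError(f"Invalid day input: {week_commencing}")
+         # '%G-W%V-%u' reads an ISO year/week/weekday triple; '-1' pins Monday.
+         mondays = pd.to_datetime(df[week_col].apply(lambda s: datetime.strptime(s + '-1', '%G-W%V-%u')))
+         df['OBS'] = (mondays + pd.Timedelta(days=offsets[week_commencing])).dt.strftime('%d/%m/%Y')
+         return df
+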
+     def test_exclude_rows(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': [1, 2, 3, 4, 5],
+             'col2': ['A', 'B', 'C', 'D', 'E'],
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output when excluding rows with 'col1' values [2, 4]
+         expected_output_col1 = pd.DataFrame({
+             'col1': [1, 3, 5],
+             'col2': ['A', 'C', 'E'],
+         })
+
+         # Expected output when excluding rows with 'col2' values ['A', 'C', 'E']
+         expected_output_col2 = pd.DataFrame({
+             'col1': [2, 4],
+             'col2': ['B', 'D'],
+         })
+
+         # Test excluding rows based on 'col1'
+         result_col1 = self.dp.exclude_rows(df, 'col1', [2, 4])
+         pd.testing.assert_frame_equal(result_col1.reset_index(drop=True), expected_output_col1)
+
+         # Test excluding rows based on 'col2'
+         result_col2 = self.dp.exclude_rows(df, 'col2', ['A', 'C', 'E'])
+         pd.testing.assert_frame_equal(result_col2.reset_index(drop=True), expected_output_col2)
+
+     def test_rename_cols(self):
+         # Create a test DataFrame
+         test_data = {
+             'OBS': [1, 2, 3],
+             'Column One': [10, 20, 30],
+             'Another Column': [100, 200, 300],
+             'Special Characters !@#': [5, 15, 25]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output with default prefix
+         expected_output_default = pd.DataFrame({
+             'OBS': [1, 2, 3],
+             'ame_column_one': [10, 20, 30],
+             'ame_another_column': [100, 200, 300],
+             'ame_special_characters_!@#': [5, 15, 25]
+         })
+
+         # Expected output with custom prefix
+         expected_output_custom = pd.DataFrame({
+             'OBS': [1, 2, 3],
+             'custom_column_one': [10, 20, 30],
+             'custom_another_column': [100, 200, 300],
+             'custom_special_characters_!@#': [5, 15, 25]
+         })
+
+         # Test renaming columns with default prefix
+         result_default = self.dp.rename_cols(df)
+         pd.testing.assert_frame_equal(result_default, expected_output_default)
+
+         # Test renaming columns with custom prefix
+         result_custom = self.dp.rename_cols(df, name='custom_')
+         pd.testing.assert_frame_equal(result_custom, expected_output_custom)
+
+         # Test that 'OBS' column remains unchanged
+         self.assertIn('OBS', result_default.columns)
+         self.assertIn('OBS', result_custom.columns)
+
+     def test_merge_new_and_old(self):
+         # Create test DataFrames for old and new data
+         old_data = {
+             'OBS': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
+             'old_values': [10, 20, 30, 40]
+         }
+         new_data = {
+             'OBS': ['2023-01-04', '2023-01-05', '2023-01-06'],
+             'new_values': [100, 200, 300]
+         }
+         old_df = pd.DataFrame(old_data)
+         new_df = pd.DataFrame(new_data)
+
+         # Expected output
+         expected_output = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),
+             'new_values': [10, 20, 30, 40, 200, 300]
+         })
+
+         # Test merging with cutoff_date='2023-01-04'
+         result = self.dp.merge_new_and_old(old_df, 'old_values', new_df, 'new_values', '2023-01-04')
+
+         # Assertions
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test that columns are correctly renamed and sorted
+         self.assertIn('OBS', result.columns)
+         self.assertIn('new_values', result.columns)
+         self.assertEqual(len(result), len(expected_output)) # Ensure row count matches
+         self.assertTrue((result['OBS'].diff().dropna() >= pd.Timedelta(0)).all()) # Check that dates are in order
+
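+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # splicing an old series onto a new one at a cutoff date, reporting
+     # everything under the new column's name, as the assertions above expect.
+     @staticmethod
+     def _sketch_merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col='OBS'):
+         cutoff = pd.to_datetime(cutoff_date)
+         old = old_df[[date_col, old_col]].copy()
+         new = new_df[[date_col, new_col]].copy()
+         old[date_col] = pd.to_datetime(old[date_col])
+         new[date_col] = pd.to_datetime(new[date_col])
+         # Old values up to and including the cutoff, new values after it.
+         old = old[old[date_col] <= cutoff].rename(columns={old_col: new_col})
+         new = new[new[date_col] > cutoff]
+         return pd.concat([old, new], ignore_index=True).sort_values(date_col).reset_index(drop=True)
+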
+     def test_merge_dataframes_on_column(self):
+         # Create test DataFrames
+         df1 = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'value1': [10, 20, 30]
+         })
+         df2 = pd.DataFrame({
+             'OBS': ['2023-01-02', '2023-01-03', '2023-01-04'],
+             'value2': [40, 50, 60]
+         })
+         df3 = pd.DataFrame({
+             'OBS': ['2023-01-03', '2023-01-04', '2023-01-05'],
+             'value3': [70, 80, 90]
+         })
+
+         # Ensure test DataFrame columns are datetime
+         df1['OBS'] = pd.to_datetime(df1['OBS'])
+         df2['OBS'] = pd.to_datetime(df2['OBS'])
+         df3['OBS'] = pd.to_datetime(df3['OBS'])
+
+         # Expected output for outer merge (cast to float64 to match the behavior of fillna)
+         expected_output_outer = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']),
+             'value1': [10.0, 20.0, 30.0, 0.0, 0.0],
+             'value2': [0.0, 40.0, 50.0, 60.0, 0.0],
+             'value3': [0.0, 0.0, 70.0, 80.0, 90.0]
+         })
+
+         # Expected output for inner merge
+         expected_output_inner = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-03']),
+             'value1': [30],
+             'value2': [50],
+             'value3': [70]
+         })
+
+         # Test outer merge
+         result_outer = self.dp.merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')
+         pd.testing.assert_frame_equal(result_outer.reset_index(drop=True), expected_output_outer)
+
+         # Test inner merge
+         result_inner = self.dp.merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='inner')
+         pd.testing.assert_frame_equal(result_inner.reset_index(drop=True), expected_output_inner)
+
+         # Test with empty DataFrame list
+         result_empty = self.dp.merge_dataframes_on_column([], common_column='OBS', merge_how='outer')
+         self.assertIsNone(result_empty)
+
+         # Test with one DataFrame in the list
+         result_single = self.dp.merge_dataframes_on_column([df1], common_column='OBS', merge_how='outer')
+         pd.testing.assert_frame_equal(result_single.reset_index(drop=True), df1)
+
+         # Test that the common column is sorted and converted to datetime
+         self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer['OBS']))
+         self.assertTrue((result_outer['OBS'].diff().dropna() >= pd.Timedelta(0)).all()) # Check sorted dates
+
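+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # folding a list of frames into one with functools.reduce, then tidying
+     # the common column the way the assertions above expect.
+     @staticmethod
+     def _sketch_merge_dataframes_on_column(dfs, common_column='OBS', merge_how='outer'):
+         from functools import reduce
+         if not dfs:
+             return None
+         merged = reduce(lambda left, right: pd.merge(left, right, on=common_column, how=merge_how), dfs)
+         merged[common_column] = pd.to_datetime(merged[common_column])
+         return merged.sort_values(common_column).fillna(0).reset_index(drop=True)
+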
+     def test_merge_and_update_dfs(self):
+         # Create test DataFrames
+         df1 = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'value1': [10, 20, 30],
+             'value2': [100, 200, 300]
+         })
+
+         df2 = pd.DataFrame({
+             'OBS': ['2023-01-02', '2023-01-03', '2023-01-04'],
+             'value1': [15, 25, 35], # Updates for value1
+             'value3': [400, 500, 600] # New column
+         })
+
+         # Ensure test DataFrame columns are datetime
+         df1['OBS'] = pd.to_datetime(df1['OBS'])
+         df2['OBS'] = pd.to_datetime(df2['OBS'])
+
+         # Expected output with float64 for numeric columns
+         expected_output = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04']),
+             'value1': [10.0, 15.0, 25.0, 35.0], # Updated where applicable
+             'value2': [100.0, 200.0, 300.0, 0.0], # From df1, 0 where not available
+             'value3': [0.0, 400.0, 500.0, 600.0] # From df2, 0 where not available
+         })
+
+         # Test the merge and update function
+         result = self.dp.merge_and_update_dfs(df1, df2, key_column='OBS')
+
+         # Assertions
+         pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output)
+
+         # Test column order is preserved in the result
+         self.assertListEqual(list(result.columns), list(expected_output.columns))
+
+         # Test that the OBS column is sorted
+         self.assertTrue((result['OBS'].diff().dropna() >= pd.Timedelta(0)).all())
+
+     def test_convert_us_to_uk_dates(self):
+         # Create a test DataFrame
+         test_data = {
+             'date_col': ['01-02-2023', '03/04/2023', '05-06-2023', '07/08/2023']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output
+         expected_output = pd.DataFrame({
+             'date_col': pd.to_datetime(['2023-01-02', '2023-03-04', '2023-05-06', '2023-07-08'])
+         })
+
+         # Test the conversion function
+         result = self.dp.convert_us_to_uk_dates(df.copy(), 'date_col')
+
+         # Assertions
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test invalid input formats
+         invalid_data = pd.DataFrame({'date_col': ['invalid-date', '12345']})
+         with self.assertRaises(ValueError):
+             self.dp.convert_us_to_uk_dates(invalid_data.copy(), 'date_col')
+
+         # Test missing values
+         missing_data = pd.DataFrame({'date_col': [None, '03/04/2023']})
+         result_with_missing = self.dp.convert_us_to_uk_dates(missing_data.copy(), 'date_col')
+         expected_with_missing = pd.DataFrame({
+             'date_col': [pd.NaT, pd.to_datetime('2023-03-04')]
+         })
+         pd.testing.assert_frame_equal(result_with_missing, expected_with_missing)
+
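+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # normalising the separator and parsing month-first, so '01-02-2023'
+     # becomes 2023-01-02; invalid strings raise ValueError and missing
+     # values pass through as NaT, matching the assertions above.
+     @staticmethod
+     def _sketch_convert_us_to_uk_dates(df, date_col):
+         df[date_col] = pd.to_datetime(
+             df[date_col].str.replace('-', '/', regex=False),
+             format='%m/%d/%Y')
+         return df
+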
+     def test_pivot_table(self):
+         # Create a test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03'],
+             'category': ['A', 'B', 'A', 'B', 'A'],
+             'value': [10.0, 20.0, 30.0, 40.0, 50.0]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Ensure the 'date' column is in datetime format
+         df['date'] = pd.to_datetime(df['date'])
+
+         # Expected output for basic pivot table
+         expected_output_basic = pd.DataFrame({
+             'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03']),
+             'A': [10.0, 30.0, 50.0], # Cast to float64
+             'B': [20.0, 40.0, 0.0] # Cast to float64
+         })
+         expected_output_basic.columns.name = 'category'
+
+         # Test basic pivot table
+         result_basic = self.dp.pivot_table(df.copy(), index_col='date', columns='category', values_col='value', margins=False, fill_value=0)
+
+         # Convert 'date' columns in both DataFrames to datetime for comparison
+         result_basic['date'] = pd.to_datetime(result_basic['date'])
+         expected_output_basic['date'] = pd.to_datetime(expected_output_basic['date'])
+         pd.testing.assert_frame_equal(result_basic, expected_output_basic)
+
+         # Expected output for pivot table with margins
+         expected_output_with_margins = pd.DataFrame({
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03', 'Total'],
+             'A': [10.0, 30.0, 50.0, 90.0],
+             'B': [20.0, 40.0, 0.0, 60.0],
+             'Total': [30.0, 70.0, 50.0, 150.0]
+         })
+         expected_output_with_margins['date'] = pd.to_datetime(
+             expected_output_with_margins['date'], errors='coerce'
+         ).fillna('Total')
+         expected_output_with_margins.columns.name = 'category'
+
+         # Test pivot table with margins
+         result_with_margins = self.dp.pivot_table(df.copy(), index_col='date', columns='category', values_col='value', margins=True, fill_value=0)
+         result_with_margins['date'] = pd.to_datetime(result_with_margins['date'], errors='coerce').fillna('Total')
+         pd.testing.assert_frame_equal(result_with_margins, expected_output_with_margins)
+
+     def test_apply_lookup_table_for_columns(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': ['apple', 'banana', 'carrot', 'date', 'eggplant'],
+             'col2': ['fruit', 'fruit', 'vegetable', 'fruit', 'vegetable']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Lookup dictionary
+         lookup_dict = {
+             'apple': 'Red Fruit',
+             'banana': 'Yellow Fruit',
+             'carrot': 'Orange Vegetable',
+             'date': 'Brown Fruit'
+         }
+
+         # Expected output with single column lookup
+         expected_output_single = df.copy()
+         expected_output_single['Mapping'] = ['Red Fruit', 'Yellow Fruit', 'Orange Vegetable', 'Brown Fruit', 'Other']
+
+         # Test with a single column
+         result_single = self.dp.apply_lookup_table_for_columns(df.copy(), col_names=['col1'], to_find_dict=lookup_dict)
+         pd.testing.assert_frame_equal(result_single, expected_output_single)
+
+         # Expected output with multiple column lookup
+         expected_output_multiple = df.copy()
+         expected_output_multiple['Mapping'] = ['Other', 'Other', 'Other', 'Brown Fruit', 'Other']
+
+         # Update lookup dictionary to match merged keys
+         lookup_dict_merged = {
+             'date|fruit': 'Brown Fruit'
+         }
+
+         # Test with multiple columns
+         result_multiple = self.dp.apply_lookup_table_for_columns(df.copy(), col_names=['col1', 'col2'], to_find_dict=lookup_dict_merged)
+         pd.testing.assert_frame_equal(result_multiple, expected_output_multiple)
+
+         # Test case where no match is found
+         df_no_match = pd.DataFrame({'col1': ['unknown']})
+         expected_no_match = df_no_match.copy()
+         expected_no_match['Mapping'] = ['Other']
+         result_no_match = self.dp.apply_lookup_table_for_columns(df_no_match, col_names=['col1'], to_find_dict=lookup_dict)
+         pd.testing.assert_frame_equal(result_no_match, expected_no_match)
+
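+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # the lookup key is the chosen columns joined with '|' (a single column
+     # is its own key), mapped through the dict with 'Other' as the fallback.
+     @staticmethod
+     def _sketch_apply_lookup_table_for_columns(df, col_names, to_find_dict):
+         keys = df[col_names].astype(str).agg('|'.join, axis=1)
+         df['Mapping'] = keys.map(to_find_dict).fillna('Other')
+         return df
+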
+     def test_aggregate_daily_to_wc_wide(self):
+         # Create a test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-08', '2023-01-09', '2023-01-10'],
+             'group': ['A', 'A', 'B', 'B', 'B'],
+             'value1': [10, 20, 30, 40, None],
+             'value2': [100, 200, 300, None, 500]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output for weekly aggregation in wide format
+         expected_output = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-08'], # Weeks starting on Sunday
+             'value1_A': [30.0, 0.0],
+             'value1_B': [0.0, 70.0],
+             'value2_A': [300.0, 0.0],
+             'value2_B': [0.0, 800.0],
+             'Total value1': [30.0, 70.0],
+             'Total value2': [300.0, 800.0]
+         })
+
+         # Test aggregation with totals included
+         result = self.dp.aggregate_daily_to_wc_wide(
+             df=df.copy(),
+             date_column='date',
+             group_columns=['group'],
+             sum_columns=['value1', 'value2'],
+             wc='sun',
+             aggregation='sum',
+             include_totals=True
+         )
+
+         # Ensure 'OBS' columns are datetime for comparison
+         result['OBS'] = pd.to_datetime(result['OBS'])
+         expected_output['OBS'] = pd.to_datetime(expected_output['OBS'])
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test without group columns (no totals, single wide column)
+         expected_output_no_group = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-08'],
+             'value1': [30.0, 70.0],
+             'value2': [300.0, 800.0]
+         })
+
+         result_no_group = self.dp.aggregate_daily_to_wc_wide(
+             df=df.copy(),
+             date_column='date',
+             group_columns=[],
+             sum_columns=['value1', 'value2'],
+             wc='sun',
+             aggregation='sum',
+             include_totals=False
+         )
+
+         # Ensure 'OBS' columns are datetime for comparison
+         result_no_group['OBS'] = pd.to_datetime(result_no_group['OBS'])
+         expected_output_no_group['OBS'] = pd.to_datetime(expected_output_no_group['OBS'])
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_no_group, expected_output_no_group)
+
+     def test_merge_cols_with_seperator(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': ['apple', 'banana', 'cherry'],
+             'col2': ['red', 'yellow', 'red'],
+             'col3': ['fruit', 'fruit', 'fruit']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Test merging two columns with default separator
+         expected_output_default = df.copy()
+         expected_output_default['Merged'] = ['apple_red', 'banana_yellow', 'cherry_red']
+
+         result_default = self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1', 'col2'])
+         pd.testing.assert_frame_equal(result_default, expected_output_default)
+
+         # Test merging three columns with custom separator
+         expected_output_custom = df.copy()
+         expected_output_custom['Merged'] = ['apple-red-fruit', 'banana-yellow-fruit', 'cherry-red-fruit']
+
+         result_custom = self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1', 'col2', 'col3'], seperator='-')
+         pd.testing.assert_frame_equal(result_custom, expected_output_custom)
+
+         # Test merging with starting and ending prefix strings
+         expected_output_prefix = df.copy()
+         expected_output_prefix['Merged'] = ['Start:apple_red:End', 'Start:banana_yellow:End', 'Start:cherry_red:End']
+
+         result_prefix = self.dp.merge_cols_with_seperator(
+             df.copy(),
+             col_names=['col1', 'col2'],
+             seperator='_',
+             starting_prefix_str='Start:',
+             ending_prefix_str=':End'
+         )
+         pd.testing.assert_frame_equal(result_prefix, expected_output_prefix)
+
+         # Test error for fewer than two columns
+         with self.assertRaises(ValueError):
+             self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1'])
+
+     def test_check_sum_of_df_cols_are_equal(self):
+         # Create test DataFrames
+         df1 = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6]
+         })
+
+         df2 = pd.DataFrame({
+             'colA': [1, 2, 3],
+             'colB': [4, 5, 6]
+         })
+
+         df3 = pd.DataFrame({
+             'colX': [1, 2, 3],
+             'colY': [4, 5, 7]
+         })
+
+         # Test case where sums are equal
+         result_equal = self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=['col1', 'col2'], cols_2=['colA', 'colB'])
+         self.assertEqual(result_equal[0], "They are equal")
+         self.assertEqual(result_equal[1], 21) # Sum of df1's columns
+         self.assertEqual(result_equal[2], 21) # Sum of df2's columns
+
+         # Test case where sums are not equal
+         result_not_equal = self.dp.check_sum_of_df_cols_are_equal(df1, df3, cols_1=['col1', 'col2'], cols_2=['colX', 'colY'])
+         self.assertTrue(result_not_equal[0].startswith("They are different by "))
+         self.assertEqual(result_not_equal[1], 21) # Sum of df1's columns
+         self.assertEqual(result_not_equal[2], 22) # Sum of df3's columns
+
+         # Test case with mismatched column names
+         with self.assertRaises(KeyError):
+             self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=['nonexistent_col'], cols_2=['colA', 'colB'])
+
+         # Test case with empty columns
+         result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=[], cols_2=[])
+         self.assertEqual(result_empty_cols[1], 0) # Sum of empty columns
+         self.assertEqual(result_empty_cols[2], 0) # Sum of empty columns
+         self.assertEqual(result_empty_cols[0], "They are equal")
+
+     def test_convert_2_df_cols_to_dict(self):
+         # Create a test DataFrame
+         df = pd.DataFrame({
+             'key_col': ['key1', 'key2', 'key3'],
+             'value_col': [10, 20, 30]
+         })
+
+         # Expected dictionary
+         expected_dict = {
+             'key1': 10,
+             'key2': 20,
+             'key3': 30
+         }
+
+         # Test basic functionality
+         result = self.dp.convert_2_df_cols_to_dict(df, 'key_col', 'value_col')
+         self.assertEqual(result, expected_dict)
+
+         # Test with non-unique keys
+         df_non_unique = pd.DataFrame({
+             'key_col': ['key1', 'key2', 'key1'],
+             'value_col': [10, 20, 30]
+         })
+         expected_dict_non_unique = {
+             'key1': 30, # Last occurrence of 'key1' should overwrite the earlier one
+             'key2': 20
+         }
+         result_non_unique = self.dp.convert_2_df_cols_to_dict(df_non_unique, 'key_col', 'value_col')
+         self.assertEqual(result_non_unique, expected_dict_non_unique)
+
+         # Test with missing key or value column
+         with self.assertRaises(ValueError):
+             self.dp.convert_2_df_cols_to_dict(df, 'missing_key_col', 'value_col')
+
+         with self.assertRaises(ValueError):
+             self.dp.convert_2_df_cols_to_dict(df, 'key_col', 'missing_value_col')
+
+         # Test with empty DataFrame
+         df_empty = pd.DataFrame(columns=['key_col', 'value_col'])
+         expected_empty_dict = {}
+         result_empty = self.dp.convert_2_df_cols_to_dict(df_empty, 'key_col', 'value_col')
+         self.assertEqual(result_empty, expected_empty_dict)
+
+     def test_keyword_lookup_replacement(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': ['A', 'B', 'C', 'D'],
+             'col2': ['X', 'Y', 'Z', 'W'],
+             'value_col': ['old_value', 'old_value', 'unchanged', 'old_value']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Lookup dictionary for replacements
+         lookup_dict = {
+             'A|X': 'new_value_1',
+             'B|Y': 'new_value_2',
+             'D|W': 'new_value_3'
+         }
+
+         # Expected output
+         expected_output = df.copy()
+         expected_output['Updated Column'] = ['new_value_1', 'new_value_2', 'unchanged', 'new_value_3']
+
+         # Apply the function
+         result = self.dp.keyword_lookup_replacement(
+             df.copy(),
+             col='value_col',
+             replacement_rows='old_value',
+             cols_to_merge=['col1', 'col2'],
+             replacement_lookup_dict=lookup_dict
+         )
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test case where no replacement is needed
+         df_no_replacement = pd.DataFrame({
+             'col1': ['E', 'F'],
+             'col2': ['G', 'H'],
+             'value_col': ['unchanged', 'unchanged']
+         })
+         expected_no_replacement = df_no_replacement.copy()
+         expected_no_replacement['Updated Column'] = ['unchanged', 'unchanged']
+
+         result_no_replacement = self.dp.keyword_lookup_replacement(
+             df_no_replacement.copy(),
+             col='value_col',
+             replacement_rows='old_value',
+             cols_to_merge=['col1', 'col2'],
+             replacement_lookup_dict=lookup_dict
+         )
+
+         pd.testing.assert_frame_equal(result_no_replacement, expected_no_replacement)
+
+     def test_convert_df_wide_2_long(self):
+         # Create a test DataFrame
+         test_data = {
+             'id': [1, 2, 3],
+             'name': ['Alice', 'Bob', 'Charlie'],
+             'score1': [85, 90, 78],
+             'score2': [88, 92, 81]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output for the transformation
+         expected_output = pd.DataFrame({
+             'id': [1, 2, 3, 1, 2, 3],
+             'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Charlie'],
+             'Stacked': ['score1', 'score1', 'score1', 'score2', 'score2', 'score2'],
+             'Value': [85, 90, 78, 88, 92, 81]
+         })
+
+         # Apply the function
+         result = self.dp.convert_df_wide_2_long(
+             df.copy(),
+             value_cols=['score1', 'score2'],
+             variable_col_name='Stacked',
+             value_col_name='Value'
+         )
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test case with only one column (should raise ValueError)
+         with self.assertRaises(ValueError):
+             self.dp.convert_df_wide_2_long(
+                 df.copy(),
+                 value_cols=['score1'],
+                 variable_col_name='Stacked',
+                 value_col_name='Value'
+             )
+
+         # Test case with no value columns (should raise ValueError)
+         with self.assertRaises(ValueError):
+             self.dp.convert_df_wide_2_long(
+                 df.copy(),
+                 value_cols=[],
+                 variable_col_name='Stacked',
+                 value_col_name='Value'
+             )
+
+     def test_format_numbers_with_commas(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': [1000, 2500000, 12345.678, None],
+             'col2': [2000.5, 350000.75, 0, -12345],
+             'col3': ['text', 'another text', 50000, 123.45]
+         }
+         df = pd.DataFrame(test_data).fillna(value=pd.NA) # Normalize None to pd.NA
+
+         # Expected output with 2 decimal places
+         expected_data = {
+             'col1': ['1,000.00', '2,500,000.00', '12,345.68', pd.NA],
+             'col2': ['2,000.50', '350,000.75', '0.00', '-12,345.00'],
+             'col3': ['text', 'another text', '50,000.00', '123.45']
+         }
+         expected_output = pd.DataFrame(expected_data)
+
+         # Apply the function
+         result = self.dp.format_numbers_with_commas(df, decimal_length_chosen=2)
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output, check_dtype=False)
+
+     def test_filter_df_on_multiple_conditions(self):
+         # Create a test DataFrame
+         test_data = {
+             'id': [1, 2, 3, 4, 5],
+             'value': [10, 20, 30, 40, 50],
+             'category': ['A', 'B', 'A', 'C', 'A'],
+             'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])
+         }
+         df = pd.DataFrame(test_data)
+
+         # Test Case 1: Single condition (Equality)
+         filters_dict = {'category': "== 'A'"}
+         expected_output = df[df['category'] == 'A']
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: Multiple conditions (Equality and Greater Than)
+         filters_dict = {'category': "== 'A'", 'value': "> 20"}
+         expected_output = df[(df['category'] == 'A') & (df['value'] > 20)]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 3: Date comparison
+         filters_dict = {'date': ">= '2023-01-03'"}
+         expected_output = df[df['date'] >= pd.to_datetime('2023-01-03')]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 4: Inequality
+         filters_dict = {'value': "!= 30"}
+         expected_output = df[df['value'] != 30]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 5: Mixed conditions
+         filters_dict = {'category': "== 'A'", 'date': "<= '2023-01-03'"}
+         expected_output = df[(df['category'] == 'A') & (df['date'] <= pd.to_datetime('2023-01-03'))]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
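+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # ANDing one expression per column and delegating to DataFrame.query;
+     # backticks keep arbitrary column names legal in the expression.
+     @staticmethod
+     def _sketch_filter_df_on_multiple_conditions(df, filters_dict):
+         expr = ' and '.join(f'`{col}` {cond}' for col, cond in filters_dict.items())
+         return df.query(expr)
+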
+     def test_remove_zero_values(self):
+         # Create a test DataFrame
+         test_data = {
+             'id': [1, 2, 3, 4, 5],
+             'value': [0, 10, 20, 0, 30],
+             'category': ['A', 'B', 'A', 'C', 'A']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Test Case 1: Expected output after removing rows where 'value' is 0
+         expected_output = df[df['value'] != 0]
+         result = self.dp.remove_zero_values(df, 'value')
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: No zero values
+         df_no_zeros = df[df['value'] != 0]
+         expected_output_no_change = df_no_zeros.copy()
+         result_no_change = self.dp.remove_zero_values(df_no_zeros, 'value')
+         pd.testing.assert_frame_equal(result_no_change, expected_output_no_change)
+
+         # Test Case 3: Column with only zero values
+         df_zeros = pd.DataFrame({'value': [0, 0, 0, 0, 0]})
+         expected_output_empty = pd.DataFrame({'value': []}, dtype=df_zeros['value'].dtype) # Ensure dtype matches
+         result_empty = self.dp.remove_zero_values(df_zeros, 'value')
+         pd.testing.assert_frame_equal(result_empty, expected_output_empty)
+
+     def test_fill_weekly_date_range(self):
+         # Test input DataFrame
+         test_data = {
+             'date': ['2023-01-02', '2023-01-16', '2023-01-30'], # Weekly data with gaps
+             'value': [10.0, 20.0, 30.0]
+         }
+         df = pd.DataFrame(test_data)
+         df['date'] = pd.to_datetime(df['date'])
+
+         # Expected output DataFrame
+         expected_data = {
+             'date': ['2023-01-02', '2023-01-09', '2023-01-16', '2023-01-23', '2023-01-30'],
+             'value': [10.0, 0.0, 20.0, 0.0, 30.0]
+         }
+         expected_output = pd.DataFrame(expected_data)
+         expected_output['date'] = pd.to_datetime(expected_output['date'])
+
+         # Call the function via the instance created in setUp
+         result = self.dp.fill_weekly_date_range(df, date_column='date', freq='W-MON')
+
+         # Assert the result matches the expected output
+         pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output.reset_index(drop=True))
+
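+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # reindexing onto a complete weekly range between the first and last
+     # observed dates, filling the gaps with zeros as asserted above.
+     @staticmethod
+     def _sketch_fill_weekly_date_range(df, date_column, freq='W-MON'):
+         full_range = pd.date_range(df[date_column].min(), df[date_column].max(), freq=freq)
+         return (df.set_index(date_column)
+                   .reindex(full_range, fill_value=0)
+                   .rename_axis(date_column)
+                   .reset_index())
+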
+     def test_add_prefix_and_suffix(self):
+         # Test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'value1': [10, 20, 30],
+             'value2': [40, 50, 60]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output when no date column is excluded
+         expected_data_no_date_col = {
+             'prefix_date_suffix': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'prefix_value1_suffix': [10, 20, 30],
+             'prefix_value2_suffix': [40, 50, 60]
+         }
+         expected_output_no_date_col = pd.DataFrame(expected_data_no_date_col)
+
+         # Expected output when the date column is excluded
+         expected_data_with_date_col = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'prefix_value1_suffix': [10, 20, 30],
+             'prefix_value2_suffix': [40, 50, 60]
+         }
+         expected_output_with_date_col = pd.DataFrame(expected_data_with_date_col)
+
+         # Call the function without excluding a date column
+         result_no_date_col = self.dp.add_prefix_and_suffix(df.copy(), prefix='prefix_', suffix='_suffix')
+
+         # Assert result matches the expected output
+         pd.testing.assert_frame_equal(result_no_date_col, expected_output_no_date_col)
+
+         # Call the function with a date column excluded
+         result_with_date_col = self.dp.add_prefix_and_suffix(df.copy(), prefix='prefix_', suffix='_suffix', date_col='date')
+
+         # Assert result matches the expected output
+         pd.testing.assert_frame_equal(result_with_date_col, expected_output_with_date_col)
+
+     def test_create_dummies(self):
+         # Test Case 1: Basic functionality without date column
+         df = pd.DataFrame({
+             'col1': [0, 1, 2],
+             'col2': [3, 4, 0],
+             'col3': [5, 0, 0]
+         })
+         dummy_threshold = 1
+         expected_output = pd.DataFrame({
+             'col1': [0, 0, 1],
+             'col2': [1, 1, 0],
+             'col3': [1, 0, 0]
+         })
+         result = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: With date column
+         df_with_date = pd.DataFrame({
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'col1': [0, 1, 2],
+             'col2': [3, 4, 0]
+         })
+         expected_output_with_date = pd.DataFrame({
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'col1': [0, 0, 1],
+             'col2': [1, 1, 0]
+         })
+         result_with_date = self.dp.create_dummies(df_with_date.copy(), date_col='date', dummy_threshold=dummy_threshold)
+         pd.testing.assert_frame_equal(result_with_date, expected_output_with_date)
+
+         # Test Case 3: Adding total dummy column
+         expected_output_with_total = expected_output.copy()
+         expected_output_with_total['total'] = [1, 1, 1]
+         result_with_total = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold, add_total_dummy_col='Yes')
+         pd.testing.assert_frame_equal(result_with_total, expected_output_with_total)
+
+         # Test Case 4: Adding total dummy column with date column
+         expected_output_with_date_and_total = expected_output_with_date.copy()
+         expected_output_with_date_and_total['total'] = [1, 1, 1]
+         result_with_date_and_total = self.dp.create_dummies(
+             df_with_date.copy(),
+             date_col='date',
+             dummy_threshold=dummy_threshold,
+             add_total_dummy_col='Yes',
+         )
+         pd.testing.assert_frame_equal(result_with_date_and_total, expected_output_with_date_and_total)
+
+         # Test Case 5: Threshold of 0 (all positive numbers become 1)
+         df_threshold_0 = pd.DataFrame({
+             'col1': [-1, 0, 1],
+             'col2': [0, 2, -3]
+         })
+         expected_output_threshold_0 = pd.DataFrame({
+             'col1': [0, 0, 1],
+             'col2': [0, 1, 0]
+         })
+         result_threshold_0 = self.dp.create_dummies(df_threshold_0.copy(), dummy_threshold=0)
+         pd.testing.assert_frame_equal(result_threshold_0, expected_output_threshold_0)
+
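+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # values strictly above the threshold become 1, everything else 0; an
+     # optional 'total' column records whether any dummy fired in the row.
+     @staticmethod
+     def _sketch_create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No'):
+         out = df.copy()
+         value_cols = [c for c in out.columns if c != date_col]
+         out[value_cols] = (out[value_cols] > dummy_threshold).astype(int)
+         if add_total_dummy_col != 'No':
+             out['total'] = out[value_cols].max(axis=1)
+         return out
+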
+     def test_replace_substrings(self):
+         # Test Case 1: Basic replacement
+         df = pd.DataFrame({
+             'text': ['hello world', 'python programming', 'hello python']
+         })
+         replacements = {'hello': 'hi', 'python': 'java'}
+         expected_output = pd.DataFrame({
+             'text': ['hi world', 'java programming', 'hi java']
+         })
+         result = self.dp.replace_substrings(df.copy(), 'text', replacements)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: Replacement with to_lower=True
+         df_mixed_case = pd.DataFrame({
+             'text': ['Hello World', 'PYTHON Programming', 'hello PYTHON']
+         })
+         expected_output_lower = pd.DataFrame({
+             'text': ['hi world', 'java programming', 'hi java']
+         })
+         result_lower = self.dp.replace_substrings(df_mixed_case.copy(), 'text', replacements, to_lower=True)
+         pd.testing.assert_frame_equal(result_lower, expected_output_lower)
+
+         # Test Case 3: Replacement into a new column
+         df_new_col = pd.DataFrame({
+             'text': ['hello world', 'python programming', 'hello python']
+         })
+         expected_output_new_col = pd.DataFrame({
+             'text': ['hello world', 'python programming', 'hello python'],
+             'new_text': ['hi world', 'java programming', 'hi java']
+         })
+         result_new_col = self.dp.replace_substrings(df_new_col.copy(), 'text', replacements, new_column='new_text')
+         pd.testing.assert_frame_equal(result_new_col, expected_output_new_col)
+
+     def test_add_total_column(self):
+         # Test Case 1: Basic functionality without excluding any column
+         df = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6],
+             'col3': [7, 8, 9]
+         })
+         expected_output = df.copy()
+         expected_output['Total'] = [12, 15, 18]
+         result = self.dp.add_total_column(df.copy())
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: Excluding a column from the total
+         df = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6],
+             'col3': [7, 8, 9]
+         })
+         expected_output_exclude = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6],
+             'col3': [7, 8, 9],
+             'Total': [5, 7, 9] # Sum without 'col3'
+         })
+         result_exclude = self.dp.add_total_column(df.copy(), exclude_col='col3')
+         pd.testing.assert_frame_equal(result_exclude, expected_output_exclude)
+
+         # Test Case 3: Custom total column name
+         custom_total_col_name = 'Sum'
+         expected_output_custom = df.copy()
+         expected_output_custom[custom_total_col_name] = [12, 15, 18]
+         result_custom = self.dp.add_total_column(df.copy(), total_col_name=custom_total_col_name)
+         pd.testing.assert_frame_equal(result_custom, expected_output_custom)
+
+         # Test Case 4: DataFrame with a single column
+         single_col_df = pd.DataFrame({'col1': [1, 2, 3]})
+         expected_single_col = single_col_df.copy()
+         expected_single_col['Total'] = [1, 2, 3]
+         result_single_col = self.dp.add_total_column(single_col_df.copy())
+         pd.testing.assert_frame_equal(result_single_col, expected_single_col)
+
+     def test_apply_lookup_table_based_on_substring(self):
+         # Test Case 1: Basic categorization
+         df = pd.DataFrame({
+             'text': ['I love apples', 'Bananas are great', 'Something else', 'Grapes are sour']
+         })
+         category_dict = {
+             'apple': 'Fruit',
+             'banana': 'Fruit',
+             'cherry': 'Fruit',
+             'grape': 'Fruit'
+         }
+         expected_output = pd.DataFrame({
+             'text': ['I love apples', 'Bananas are great', 'Something else', 'Grapes are sour'],
+             'Category': ['Fruit', 'Fruit', 'Other', 'Fruit']
+         })
+         result = self.dp.apply_lookup_table_based_on_substring(df.copy(), 'text', category_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+     def test_compare_overlap(self):
+         """
+         Test the compare_overlap function to ensure it calculates differences
+         and their totals correctly across overlapping date ranges.
+         """
+         # 1. Create sample data for df1 (covers 2021-01-01 to 2021-01-04)
+         df1_data = [
+             {"date": "2021-01-01", "value": 10, "count": 1},
+             {"date": "2021-01-02", "value": 15, "count": 2},
+             {"date": "2021-01-03", "value": 20, "count": 3},
+             {"date": "2021-01-04", "value": 25, "count": 4},
+         ]
+         df1 = pd.DataFrame(df1_data)
+
+         # 2. Create sample data for df2 (covers 2021-01-03 to 2021-01-05)
+         df2_data = [
+             {"date": "2021-01-03", "value": 22, "count": 2},
+             {"date": "2021-01-04", "value": 20, "count": 5},
+             {"date": "2021-01-05", "value": 30, "count": 6},
+         ]
+         df2 = pd.DataFrame(df2_data)
+
+         # 3. Call compare_overlap from your dataprocessing class
+         diff_df, total_diff_df = self.dp.compare_overlap(df1, df2, 'date')
+
+         # 4. Define the expected outputs for the overlapping dates
+         expected_diff_df = pd.DataFrame({
+             'date': pd.to_datetime(['2021-01-03', '2021-01-04']),
+             'diff_value': [-2, 5],
+             'diff_count': [1, -1]
+         })
+
+         expected_total_diff_df = pd.DataFrame({
+             'Column': ['value', 'count'],
+             'Total Difference': [3, 0]
+         })
+
+         # 5. Use pd.testing.assert_frame_equal to check the outputs
+         # Sort and reset index to ensure matching row order
+         pd.testing.assert_frame_equal(
+             diff_df.sort_values('date').reset_index(drop=True),
+             expected_diff_df.sort_values('date').reset_index(drop=True)
+         )
+
+         # Sort by 'Column' to ensure matching row order in summary
+         pd.testing.assert_frame_equal(
+             total_diff_df.sort_values('Column').reset_index(drop=True),
+             expected_total_diff_df.sort_values('Column').reset_index(drop=True)
+         )
+
+     def test_week_commencing_2_week_commencing_conversion_isoweekday(self):
+         """
+         Test the isoweekday-based function to confirm each date is mapped back
+         to the 'week_commencing' day of that ISO week.
+         """
+         # 2023-01-01 was a Sunday; we'll go through Saturday (7 days).
+         df = pd.DataFrame({"date": pd.date_range("2023-01-01", periods=7, freq="D")})
+         expected_mon = pd.Series(
+             [
+                 pd.Timestamp("2022-12-26"), # Sunday -> previous Monday
+                 pd.Timestamp("2023-01-02"), # Monday
+                 pd.Timestamp("2023-01-02"), # Tuesday
+                 pd.Timestamp("2023-01-02"), # Wednesday
+                 pd.Timestamp("2023-01-02"), # Thursday
+                 pd.Timestamp("2023-01-02"), # Friday
+                 pd.Timestamp("2023-01-02"), # Saturday
+             ],
+             name="week_start_mon"
+         )
+
+         # Use the new function from our data processing object
+         result = self.dp.week_commencing_2_week_commencing_conversion_isoweekday(
+             df.copy(),
+             date_col="date",
+             week_commencing="mon"
+         )
+
+         # Compare the 'week_start_mon' column with our expected results
+         pd.testing.assert_series_equal(
+             result["week_start_mon"], # actual
+             expected_mon # expected
+         )
+
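+     # Illustrative sketch (an assumption, not the packaged implementation):
+     # stepping each date back to the chosen ISO start day, so a Sunday maps
+     # to the Monday of its ISO week (the previous Monday), as asserted above.
+     @staticmethod
+     def _sketch_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon'):
+         iso_day = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}[week_commencing]
+         offset_days = (df[date_col].dt.isocalendar().day.astype(int) - iso_day) % 7
+         df[f'week_start_{week_commencing}'] = df[date_col] - pd.to_timedelta(offset_days, unit='D')
+         return df
+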
+     def test_plotly_chart(self):
+         """
+         Test the plot_chart function, creating the data inside the test and
+         verifying that the resulting figure has the correct traces and layout.
+         """
+         # Instantiate your data processing class (which includes plot_chart).
+         dp = dataprocessing()
+
+         # Create a small DataFrame for testing (inline)
+         df = pd.DataFrame({
+             'date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
+             'A': range(5), # 0,1,2,3,4
+             'B': range(10, 15) # 10,11,12,13,14
+         })
+
+         # We'll test a few common chart types. Add more if desired.
+         chart_types_to_test = ['line', 'bar', 'scatter']
+
+         for ctype in chart_types_to_test:
+             with self.subTest(chart_type=ctype):
+                 # Call the plot_chart function
+                 fig = dp.plot_chart(
+                     df=df,
+                     date_col='date',
+                     value_cols=['A', 'B'],
+                     chart_type=ctype,
+                     title='Test Chart',
+                     x_title='Date',
+                     y_title='Values'
+                 )
+
+                 # Basic checks
+                 self.assertIsInstance(fig, go.Figure, f"Expected a go.Figure for chart_type={ctype}")
+                 self.assertEqual(len(fig.data), 2, f"Expected 2 traces for chart_type={ctype}")
+
+                 # Verify the figure title, axis labels, etc.
+                 self.assertEqual(fig.layout.title.text, 'Test Chart')
+                 self.assertEqual(fig.layout.xaxis.title.text, 'Date')
+                 self.assertEqual(fig.layout.yaxis.title.text, 'Values')
+
+                 # Check each trace's name and data
+                 # The first trace should correspond to column 'A', second to 'B'
+                 trace1 = fig.data[0]
+                 trace2 = fig.data[1]
+
+                 # Check the trace names
+                 self.assertEqual(trace1.name, 'A', f"Trace 1 should be named 'A' for chart_type={ctype}")
+                 self.assertEqual(trace2.name, 'B', f"Trace 2 should be named 'B' for chart_type={ctype}")
+
+                 # Check that x-values match the date range
+                 self.assertListEqual(
+                     list(trace1.x),
+                     list(df['date']),
+                     f"Trace 1 x-values do not match df['date'] for chart_type={ctype}"
+                 )
+                 self.assertListEqual(
+                     list(trace2.x),
+                     list(df['date']),
+                     f"Trace 2 x-values do not match df['date'] for chart_type={ctype}"
+                 )
+
+                 # Check that y-values match A and B columns
+                 self.assertListEqual(
+                     list(trace1.y),
+                     list(df['A']),
+                     f"Trace 1 y-values do not match df['A'] for chart_type={ctype}"
+                 )
+                 self.assertListEqual(
+                     list(trace2.y),
+                     list(df['B']),
+                     f"Trace 2 y-values do not match df['B'] for chart_type={ctype}"
+                 )
+
+                 # Optional: Additional checks for specific chart types
+                 if ctype == 'line':
+                     self.assertEqual(trace1.mode, 'lines', "Expected line mode for chart_type='line'")
+                     self.assertEqual(trace2.mode, 'lines', "Expected line mode for chart_type='line'")
+                 elif ctype == 'bar':
+                     self.assertEqual(trace1.type, 'bar', "Expected bar type for chart_type='bar'")
+                     self.assertEqual(trace2.type, 'bar', "Expected bar type for chart_type='bar'")
+                 elif ctype == 'scatter':
+                     # scatter uses markers by default in our code
+                     self.assertEqual(trace1.mode, 'markers', "Expected marker mode for chart_type='scatter'")
+                     self.assertEqual(trace2.mode, 'markers', "Expected marker mode for chart_type='scatter'")
+
+
+     ###################################################################################################################################################
+     ###################################################################################################################################################
+
+     # class TestDataPull()
+
+
+ if __name__ == '__main__':
+     unittest.main()