imsciences 1.0.2__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/unittesting.py DELETED
@@ -1,1314 +0,0 @@
1
- import os
2
- import unittest
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from mmm import dataprocessing
7
-
8
-
9
- class TestDataProcessor(unittest.TestCase):
10
- def setUp(self):
11
- self.dp = dataprocessing()
12
- self.df = pd.DataFrame(
13
- {
14
- "date": pd.date_range(start="2023-01-01", periods=10, freq="D"),
15
- "value1": range(10),
16
- "value2": range(10, 20),
17
- },
18
- )
19
- self.mixed_date_df = pd.DataFrame(
20
- {"mixed_date": ["2023-01-01", "01/02/2023", "2023/03/01", "2023-04-01"]},
21
- )
22
- self.merged_df = pd.DataFrame(
23
- {"col1": ["A", "B", "C"], "col2": ["X", "Y", "Z"]},
24
- )
25
-
26
- def test_get_wd_levels(self):
27
- current_dir = os.getcwd()
28
- parent_dir = self.dp.get_wd_levels(1)
29
- self.assertEqual(parent_dir, os.path.dirname(current_dir))
30
-
31
- def test_aggregate_daily_to_wc_long(self):
32
- # Create a test DataFrame
33
- test_data = {
34
- "date": [
35
- "2023-01-01",
36
- "2023-01-02",
37
- "2023-01-08",
38
- "2023-01-09",
39
- "2023-01-10",
40
- ],
41
- "group_col": ["A", "A", "B", "B", "B"],
42
- "value1": [10, 20, 30, 40, np.nan],
43
- "value2": [100, 200, 300, np.nan, 500],
44
- }
45
- df = pd.DataFrame(test_data)
46
-
47
- # Expected output for different test cases
48
- expected_sum_output = pd.DataFrame(
49
- {
50
- "OBS": ["2023-01-01", "2023-01-08"], # Week starting on Sunday
51
- "group_col": ["A", "B"],
52
- "value1": [30.0, 70.0],
53
- "value2": [300.0, 800.0],
54
- },
55
- )
56
-
57
- # Convert OBS column to datetime for expected DataFrame
58
- expected_sum_output["OBS"] = pd.to_datetime(expected_sum_output["OBS"])
59
-
60
- # Test sum aggregation
61
- result_sum = self.dp.aggregate_daily_to_wc_long(
62
- df,
63
- "date",
64
- ["group_col"],
65
- ["value1", "value2"],
66
- wc="sun",
67
- aggregation="sum",
68
- )
69
-
70
- # Ensure both OBS columns are datetime for comparison
71
- result_sum["OBS"] = pd.to_datetime(result_sum["OBS"])
72
-
73
- # Compare the resulting DataFrame with the expected DataFrame
74
- pd.testing.assert_frame_equal(result_sum, expected_sum_output)
75
-
76
- def test_convert_monthly_to_daily(self):
77
- # Create a test DataFrame with monthly data
78
- test_data = {
79
- "date": ["2023-01-01", "2023-02-01", "2023-03-01"],
80
- "value1": [31, 28, 31],
81
- "value2": [310, 280, 310],
82
- }
83
- df = pd.DataFrame(test_data)
84
-
85
- # Expected output DataFrame when divide=True
86
- expected_daily_data_divide = {
87
- "date": pd.date_range(start="2023-01-01", end="2023-01-31").tolist()
88
- + pd.date_range(start="2023-02-01", end="2023-02-28").tolist()
89
- + pd.date_range(start="2023-03-01", end="2023-03-31").tolist(),
90
- "value1": [1.0] * 31 + [1.0] * 28 + [1.0] * 31,
91
- "value2": [10.0] * 31 + [10.0] * 28 + [10.0] * 31,
92
- }
93
- expected_daily_df_divide = pd.DataFrame(expected_daily_data_divide)
94
-
95
- # Call the function with divide=True
96
- result_divide = self.dp.convert_monthly_to_daily(df, "date", divide=True)
97
-
98
- # Compare the resulting DataFrame with the expected DataFrame
99
- pd.testing.assert_frame_equal(
100
- result_divide.reset_index(drop=True),
101
- expected_daily_df_divide,
102
- )
103
-
104
- # Expected output DataFrame when divide=False
105
- expected_daily_data_no_divide = {
106
- "date": pd.date_range(start="2023-01-01", end="2023-01-31").tolist()
107
- + pd.date_range(start="2023-02-01", end="2023-02-28").tolist()
108
- + pd.date_range(start="2023-03-01", end="2023-03-31").tolist(),
109
- "value1": [31] * 31 + [28] * 28 + [31] * 31,
110
- "value2": [310] * 31 + [280] * 28 + [310] * 31,
111
- }
112
- expected_daily_df_no_divide = pd.DataFrame(expected_daily_data_no_divide)
113
-
114
- # Call the function with divide=False
115
- result_no_divide = self.dp.convert_monthly_to_daily(df, "date", divide=False)
116
-
117
- # Compare the resulting DataFrame with the expected DataFrame
118
- pd.testing.assert_frame_equal(
119
- result_no_divide.reset_index(drop=True),
120
- expected_daily_df_no_divide,
121
- )
122
-
123
- def test_week_of_year_mapping(self):
124
- # Create a test DataFrame with ISO week format
125
- test_data = {"week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"]}
126
- df = pd.DataFrame(test_data)
127
-
128
- # Expected outputs for different start days
129
- expected_output_mon = pd.DataFrame(
130
- {
131
- "week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"],
132
- "OBS": ["02/01/2023", "30/01/2023", "06/03/2023", "25/12/2023"],
133
- },
134
- )
135
-
136
- expected_output_sun = pd.DataFrame(
137
- {
138
- "week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"],
139
- "OBS": ["01/01/2023", "29/01/2023", "05/03/2023", "24/12/2023"],
140
- },
141
- )
142
-
143
- # Test mapping with Monday as start day
144
- result_mon = self.dp.week_of_year_mapping(df.copy(), "week_col", "mon")
145
- pd.testing.assert_frame_equal(result_mon, expected_output_mon)
146
-
147
- # Test mapping with Sunday as start day
148
- result_sun = self.dp.week_of_year_mapping(df.copy(), "week_col", "sun")
149
- pd.testing.assert_frame_equal(result_sun, expected_output_sun)
150
-
151
- # Test with invalid start day input
152
- with self.assertRaises(ValueError) as context:
153
- self.dp.week_of_year_mapping(df.copy(), "week_col", "invalid_day")
154
- self.assertIn("Invalid day input", str(context.exception))
155
-
156
- def test_rename_cols(self):
157
- # Create a test DataFrame
158
- test_data = {
159
- "OBS": [1, 2, 3],
160
- "Column One": [10, 20, 30],
161
- "Another Column": [100, 200, 300],
162
- "Special Characters !@#": [5, 15, 25],
163
- }
164
- df = pd.DataFrame(test_data)
165
-
166
- # Expected output with default prefix
167
- expected_output_default = pd.DataFrame(
168
- {
169
- "OBS": [1, 2, 3],
170
- "ame_column_one": [10, 20, 30],
171
- "ame_another_column": [100, 200, 300],
172
- "ame_special_characters_!@#": [5, 15, 25],
173
- },
174
- )
175
-
176
- # Expected output with custom prefix
177
- expected_output_custom = pd.DataFrame(
178
- {
179
- "OBS": [1, 2, 3],
180
- "custom_column_one": [10, 20, 30],
181
- "custom_another_column": [100, 200, 300],
182
- "custom_special_characters_!@#": [5, 15, 25],
183
- },
184
- )
185
-
186
- # Test renaming columns with default prefix
187
- result_default = self.dp.rename_cols(df)
188
- pd.testing.assert_frame_equal(result_default, expected_output_default)
189
-
190
- # Test renaming columns with custom prefix
191
- result_custom = self.dp.rename_cols(df, name="custom_")
192
- pd.testing.assert_frame_equal(result_custom, expected_output_custom)
193
-
194
- # Test that 'OBS' column remains unchanged
195
- self.assertIn("OBS", result_default.columns)
196
- self.assertIn("OBS", result_custom.columns)
197
-
198
- def test_merge_new_and_old(self):
199
- # Create test DataFrames for old and new data
200
- old_data = {
201
- "OBS": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"],
202
- "old_values": [10, 20, 30, 40],
203
- }
204
- new_data = {
205
- "OBS": ["2023-01-04", "2023-01-05", "2023-01-06"],
206
- "new_values": [100, 200, 300],
207
- }
208
- old_df = pd.DataFrame(old_data)
209
- new_df = pd.DataFrame(new_data)
210
-
211
- # Expected output
212
- expected_output = pd.DataFrame(
213
- {
214
- "OBS": pd.to_datetime(
215
- [
216
- "2023-01-01",
217
- "2023-01-02",
218
- "2023-01-03",
219
- "2023-01-04",
220
- "2023-01-05",
221
- "2023-01-06",
222
- ],
223
- ),
224
- "new_values": [10, 20, 30, 40, 200, 300],
225
- },
226
- )
227
-
228
- # Test merging with cutoff_date='2023-01-04'
229
- result = self.dp.merge_new_and_old(
230
- old_df,
231
- "old_values",
232
- new_df,
233
- "new_values",
234
- "2023-01-04",
235
- )
236
-
237
- # Assertions
238
- pd.testing.assert_frame_equal(result, expected_output)
239
-
240
- # Test that columns are correctly renamed and sorted
241
- self.assertIn("OBS", result.columns)
242
- self.assertIn("new_values", result.columns)
243
- self.assertEqual(len(result), len(expected_output)) # Ensure row count matches
244
- self.assertTrue(
245
- (result["OBS"].diff().dropna() >= pd.Timedelta(0)).all(),
246
- ) # Check that dates are in order
247
-
248
- def test_merge_dataframes_on_column(self):
249
- # Create test DataFrames
250
- df1 = pd.DataFrame(
251
- {"OBS": ["2023-01-01", "2023-01-02", "2023-01-03"], "value1": [10, 20, 30]},
252
- )
253
- df2 = pd.DataFrame(
254
- {"OBS": ["2023-01-02", "2023-01-03", "2023-01-04"], "value2": [40, 50, 60]},
255
- )
256
- df3 = pd.DataFrame(
257
- {"OBS": ["2023-01-03", "2023-01-04", "2023-01-05"], "value3": [70, 80, 90]},
258
- )
259
-
260
- # Ensure test DataFrame columns are datetime
261
- df1["OBS"] = pd.to_datetime(df1["OBS"])
262
- df2["OBS"] = pd.to_datetime(df2["OBS"])
263
- df3["OBS"] = pd.to_datetime(df3["OBS"])
264
-
265
- # Expected output for outer merge (cast to float64 to match the behavior of fillna)
266
- expected_output_outer = pd.DataFrame(
267
- {
268
- "OBS": pd.to_datetime(
269
- [
270
- "2023-01-01",
271
- "2023-01-02",
272
- "2023-01-03",
273
- "2023-01-04",
274
- "2023-01-05",
275
- ],
276
- ),
277
- "value1": [10.0, 20.0, 30.0, 0.0, 0.0],
278
- "value2": [0.0, 40.0, 50.0, 60.0, 0.0],
279
- "value3": [0.0, 0.0, 70.0, 80.0, 90.0],
280
- },
281
- )
282
-
283
- # Expected output for inner merge
284
- expected_output_inner = pd.DataFrame(
285
- {
286
- "OBS": pd.to_datetime(["2023-01-03"]),
287
- "value1": [30],
288
- "value2": [50],
289
- "value3": [70],
290
- },
291
- )
292
-
293
- # Test outer merge
294
- result_outer = self.dp.merge_dataframes_on_column(
295
- [df1, df2, df3],
296
- common_column="OBS",
297
- merge_how="outer",
298
- )
299
- pd.testing.assert_frame_equal(
300
- result_outer.reset_index(drop=True),
301
- expected_output_outer,
302
- )
303
-
304
- # Test inner merge
305
- result_inner = self.dp.merge_dataframes_on_column(
306
- [df1, df2, df3],
307
- common_column="OBS",
308
- merge_how="inner",
309
- )
310
- pd.testing.assert_frame_equal(
311
- result_inner.reset_index(drop=True),
312
- expected_output_inner,
313
- )
314
-
315
- # Test with empty DataFrame list
316
- result_empty = self.dp.merge_dataframes_on_column(
317
- [],
318
- common_column="OBS",
319
- merge_how="outer",
320
- )
321
- self.assertIsNone(result_empty)
322
-
323
- # Test with one DataFrame in the list
324
- result_single = self.dp.merge_dataframes_on_column(
325
- [df1],
326
- common_column="OBS",
327
- merge_how="outer",
328
- )
329
- pd.testing.assert_frame_equal(result_single.reset_index(drop=True), df1)
330
-
331
- # Test that the common column is sorted and converted to datetime
332
- self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer["OBS"]))
333
- self.assertTrue(
334
- (result_outer["OBS"].diff().dropna() >= pd.Timedelta(0)).all(),
335
- ) # Check sorted dates
336
-
337
- def test_merge_and_update_dfs(self):
338
- # Create test DataFrames
339
- df1 = pd.DataFrame(
340
- {
341
- "OBS": ["2023-01-01", "2023-01-02", "2023-01-03"],
342
- "value1": [10, 20, 30],
343
- "value2": [100, 200, 300],
344
- },
345
- )
346
-
347
- df2 = pd.DataFrame(
348
- {
349
- "OBS": ["2023-01-02", "2023-01-03", "2023-01-04"],
350
- "value1": [15, 25, 35], # Updates for value1
351
- "value3": [400, 500, 600], # New column
352
- },
353
- )
354
-
355
- # Ensure test DataFrame columns are datetime
356
- df1["OBS"] = pd.to_datetime(df1["OBS"])
357
- df2["OBS"] = pd.to_datetime(df2["OBS"])
358
-
359
- # Expected output with float64 for numeric columns
360
- expected_output = pd.DataFrame(
361
- {
362
- "OBS": pd.to_datetime(
363
- ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"],
364
- ),
365
- "value1": [10.0, 15.0, 25.0, 35.0], # Updated where applicable
366
- "value2": [100.0, 200.0, 300.0, 0.0], # From df1, 0 where not available
367
- "value3": [0.0, 400.0, 500.0, 600.0], # From df2, 0 where not available
368
- },
369
- )
370
-
371
- # Test the merge and update function
372
- result = self.dp.merge_and_update_dfs(df1, df2, key_column="OBS")
373
-
374
- # Assertions
375
- pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output)
376
-
377
- # Test column order is preserved in the result
378
- self.assertListEqual(list(result.columns), list(expected_output.columns))
379
-
380
- # Test that the OBS column is sorted
381
- self.assertTrue((result["OBS"].diff().dropna() >= pd.Timedelta(0)).all())
382
-
383
- def test_convert_us_to_uk_dates(self):
384
- # Create a test DataFrame
385
- test_data = {
386
- "date_col": ["01-02-2023", "03/04/2023", "05-06-2023", "07/08/2023"],
387
- }
388
- df = pd.DataFrame(test_data)
389
-
390
- # Expected output
391
- expected_output = pd.DataFrame(
392
- {
393
- "date_col": pd.to_datetime(
394
- ["2023-01-02", "2023-03-04", "2023-05-06", "2023-07-08"],
395
- ),
396
- },
397
- )
398
-
399
- # Test the conversion function
400
- result = self.dp.convert_us_to_uk_dates(df.copy(), "date_col")
401
-
402
- # Assertions
403
- pd.testing.assert_frame_equal(result, expected_output)
404
-
405
- # Test invalid input formats
406
- invalid_data = pd.DataFrame({"date_col": ["invalid-date", "12345"]})
407
- with self.assertRaises(ValueError):
408
- self.dp.convert_us_to_uk_dates(invalid_data.copy(), "date_col")
409
-
410
- # Test missing values
411
- missing_data = pd.DataFrame({"date_col": [None, "03/04/2023"]})
412
- result_with_missing = self.dp.convert_us_to_uk_dates(
413
- missing_data.copy(),
414
- "date_col",
415
- )
416
- expected_with_missing = pd.DataFrame(
417
- {"date_col": [pd.NaT, pd.to_datetime("2023-03-04")]},
418
- )
419
- pd.testing.assert_frame_equal(result_with_missing, expected_with_missing)
420
-
421
- def test_pivot_table(self):
422
- # Create a test DataFrame
423
- test_data = {
424
- "date": [
425
- "2023-01-01",
426
- "2023-01-01",
427
- "2023-01-02",
428
- "2023-01-02",
429
- "2023-01-03",
430
- ],
431
- "category": ["A", "B", "A", "B", "A"],
432
- "value": [10.0, 20.0, 30.0, 40.0, 50.0],
433
- }
434
- df = pd.DataFrame(test_data)
435
-
436
- # Ensure the 'date' column is in datetime format
437
- df["date"] = pd.to_datetime(df["date"])
438
-
439
- # Expected output for basic pivot table
440
- expected_output_basic = pd.DataFrame(
441
- {
442
- "date": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"]),
443
- "A": [10.0, 30.0, 50.0], # Cast to float64
444
- "B": [20.0, 40.0, 0.0], # Cast to float64
445
- },
446
- )
447
- expected_output_basic.columns.name = "category"
448
-
449
- # Test basic pivot table
450
- result_basic = self.dp.pivot_table(
451
- df.copy(),
452
- index_col="date",
453
- columns="category",
454
- values_col="value",
455
- margins=False,
456
- fill_value=0,
457
- )
458
-
459
- # Convert 'date' columns in both DataFrames to datetime for comparison
460
- result_basic["date"] = pd.to_datetime(result_basic["date"])
461
- expected_output_basic["date"] = pd.to_datetime(expected_output_basic["date"])
462
- pd.testing.assert_frame_equal(result_basic, expected_output_basic)
463
-
464
- # Expected output for pivot table with margins
465
- expected_output_with_margins = pd.DataFrame(
466
- {
467
- "date": ["2023-01-01", "2023-01-02", "2023-01-03", "Total"],
468
- "A": [10.0, 30.0, 50.0, 90.0],
469
- "B": [20.0, 40.0, 0.0, 60.0],
470
- "Total": [30.0, 70.0, 50.0, 150.0],
471
- },
472
- )
473
- expected_output_with_margins["date"] = pd.to_datetime(
474
- expected_output_with_margins["date"],
475
- errors="coerce",
476
- ).fillna("Total")
477
- expected_output_with_margins.columns.name = "category"
478
-
479
- # Test pivot table with margins
480
- result_with_margins = self.dp.pivot_table(
481
- df.copy(),
482
- index_col="date",
483
- columns="category",
484
- values_col="value",
485
- margins=True,
486
- fill_value=0,
487
- )
488
- result_with_margins["date"] = pd.to_datetime(
489
- result_with_margins["date"],
490
- errors="coerce",
491
- ).fillna("Total")
492
- pd.testing.assert_frame_equal(result_with_margins, expected_output_with_margins)
493
-
494
- def test_apply_lookup_table_for_columns(self):
495
- # Create a test DataFrame
496
- test_data = {
497
- "col1": ["apple", "banana", "carrot", "date", "eggplant"],
498
- "col2": ["fruit", "fruit", "vegetable", "fruit", "vegetable"],
499
- }
500
- df = pd.DataFrame(test_data)
501
-
502
- # Lookup dictionary
503
- lookup_dict = {
504
- "apple": "Red Fruit",
505
- "banana": "Yellow Fruit",
506
- "carrot": "Orange Vegetable",
507
- "date": "Brown Fruit",
508
- }
509
-
510
- # Expected output with single column lookup
511
- expected_output_single = df.copy()
512
- expected_output_single["Mapping"] = [
513
- "Red Fruit",
514
- "Yellow Fruit",
515
- "Orange Vegetable",
516
- "Brown Fruit",
517
- "Other",
518
- ]
519
-
520
- # Test with a single column
521
- result_single = self.dp.apply_lookup_table_for_columns(
522
- df.copy(),
523
- col_names=["col1"],
524
- to_find_dict=lookup_dict,
525
- )
526
- pd.testing.assert_frame_equal(result_single, expected_output_single)
527
-
528
- # Expected output with multiple column lookup
529
- expected_output_multiple = df.copy()
530
- expected_output_multiple["Mapping"] = [
531
- "Other",
532
- "Other",
533
- "Other",
534
- "Brown Fruit",
535
- "Other",
536
- ]
537
-
538
- # Update lookup dictionary to match merged keys
539
- lookup_dict_merged = {"date|fruit": "Brown Fruit"}
540
-
541
- # Test with multiple columns
542
- result_multiple = self.dp.apply_lookup_table_for_columns(
543
- df.copy(),
544
- col_names=["col1", "col2"],
545
- to_find_dict=lookup_dict_merged,
546
- )
547
- pd.testing.assert_frame_equal(result_multiple, expected_output_multiple)
548
-
549
- # Test case where no match is found
550
- df_no_match = pd.DataFrame({"col1": ["unknown"]})
551
- expected_no_match = df_no_match.copy()
552
- expected_no_match["Mapping"] = ["Other"]
553
- result_no_match = self.dp.apply_lookup_table_for_columns(
554
- df_no_match,
555
- col_names=["col1"],
556
- to_find_dict=lookup_dict,
557
- )
558
- pd.testing.assert_frame_equal(result_no_match, expected_no_match)
559
-
560
- def test_aggregate_daily_to_wc_wide(self):
561
- # Create a test DataFrame
562
- test_data = {
563
- "date": [
564
- "2023-01-01",
565
- "2023-01-02",
566
- "2023-01-08",
567
- "2023-01-09",
568
- "2023-01-10",
569
- ],
570
- "group": ["A", "A", "B", "B", "B"],
571
- "value1": [10, 20, 30, 40, None],
572
- "value2": [100, 200, 300, None, 500],
573
- }
574
- df = pd.DataFrame(test_data)
575
-
576
- # Expected output for weekly aggregation in wide format
577
- expected_output = pd.DataFrame(
578
- {
579
- "OBS": ["2023-01-01", "2023-01-08"], # Weeks starting on Sunday
580
- "value1_A": [30.0, 0.0],
581
- "value1_B": [0.0, 70.0],
582
- "value2_A": [300.0, 0.0],
583
- "value2_B": [0.0, 800.0],
584
- "Total value1": [30.0, 70.0],
585
- "Total value2": [300.0, 800.0],
586
- },
587
- )
588
-
589
- # Test aggregation with totals included
590
- result = self.dp.aggregate_daily_to_wc_wide(
591
- df=df.copy(),
592
- date_column="date",
593
- group_columns=["group"],
594
- sum_columns=["value1", "value2"],
595
- wc="sun",
596
- aggregation="sum",
597
- include_totals=True,
598
- )
599
-
600
- # Ensure 'OBS' columns are datetime for comparison
601
- result["OBS"] = pd.to_datetime(result["OBS"])
602
- expected_output["OBS"] = pd.to_datetime(expected_output["OBS"])
603
-
604
- # Compare the resulting DataFrame with the expected DataFrame
605
- pd.testing.assert_frame_equal(result, expected_output)
606
-
607
- # Test without group columns (no totals, single wide column)
608
- expected_output_no_group = pd.DataFrame(
609
- {
610
- "OBS": ["2023-01-01", "2023-01-08"],
611
- "value1": [30.0, 70.0],
612
- "value2": [300.0, 800.0],
613
- },
614
- )
615
-
616
- result_no_group = self.dp.aggregate_daily_to_wc_wide(
617
- df=df.copy(),
618
- date_column="date",
619
- group_columns=[],
620
- sum_columns=["value1", "value2"],
621
- wc="sun",
622
- aggregation="sum",
623
- include_totals=False,
624
- )
625
-
626
- # Ensure 'OBS' columns are datetime for comparison
627
- result_no_group["OBS"] = pd.to_datetime(result_no_group["OBS"])
628
- expected_output_no_group["OBS"] = pd.to_datetime(
629
- expected_output_no_group["OBS"],
630
- )
631
-
632
- # Compare the resulting DataFrame with the expected DataFrame
633
- pd.testing.assert_frame_equal(result_no_group, expected_output_no_group)
634
-
635
- def test_merge_cols_with_seperator(self):
636
- # Create a test DataFrame
637
- test_data = {
638
- "col1": ["apple", "banana", "cherry"],
639
- "col2": ["red", "yellow", "red"],
640
- "col3": ["fruit", "fruit", "fruit"],
641
- }
642
- df = pd.DataFrame(test_data)
643
-
644
- # Test merging two columns with default separator
645
- expected_output_default = df.copy()
646
- expected_output_default["Merged"] = ["apple_red", "banana_yellow", "cherry_red"]
647
-
648
- result_default = self.dp.merge_cols_with_seperator(
649
- df.copy(),
650
- col_names=["col1", "col2"],
651
- )
652
- pd.testing.assert_frame_equal(result_default, expected_output_default)
653
-
654
- # Test merging three columns with custom separator
655
- expected_output_custom = df.copy()
656
- expected_output_custom["Merged"] = [
657
- "apple-red-fruit",
658
- "banana-yellow-fruit",
659
- "cherry-red-fruit",
660
- ]
661
-
662
- result_custom = self.dp.merge_cols_with_seperator(
663
- df.copy(),
664
- col_names=["col1", "col2", "col3"],
665
- seperator="-",
666
- )
667
- pd.testing.assert_frame_equal(result_custom, expected_output_custom)
668
-
669
- # Test merging with starting and ending prefix
670
- expected_output_prefix = df.copy()
671
- expected_output_prefix["Merged"] = [
672
- "Start:apple_red:End",
673
- "Start:banana_yellow:End",
674
- "Start:cherry_red:End",
675
- ]
676
-
677
- result_prefix = self.dp.merge_cols_with_seperator(
678
- df.copy(),
679
- col_names=["col1", "col2"],
680
- seperator="_",
681
- starting_prefix_str="Start:",
682
- ending_prefix_str=":End",
683
- )
684
- pd.testing.assert_frame_equal(result_prefix, expected_output_prefix)
685
-
686
- # Test error for less than two columns
687
- with self.assertRaises(ValueError):
688
- self.dp.merge_cols_with_seperator(df.copy(), col_names=["col1"])
689
-
690
- def test_check_sum_of_df_cols_are_equal(self):
691
- # Create test DataFrames
692
- df1 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
693
-
694
- df2 = pd.DataFrame({"colA": [1, 2, 3], "colB": [4, 5, 6]})
695
-
696
- df3 = pd.DataFrame({"colX": [1, 2, 3], "colY": [4, 5, 7]})
697
-
698
- # Test case where sums are equal
699
- result_equal = self.dp.check_sum_of_df_cols_are_equal(
700
- df1,
701
- df2,
702
- cols_1=["col1", "col2"],
703
- cols_2=["colA", "colB"],
704
- )
705
- self.assertEqual(result_equal[0], "They are equal")
706
- self.assertEqual(result_equal[1], 21) # Sum of df1's columns
707
- self.assertEqual(result_equal[2], 21) # Sum of df2's columns
708
-
709
- # Test case where sums are not equal
710
- result_not_equal = self.dp.check_sum_of_df_cols_are_equal(
711
- df1,
712
- df3,
713
- cols_1=["col1", "col2"],
714
- cols_2=["colX", "colY"],
715
- )
716
- self.assertTrue(result_not_equal[0].startswith("They are different by "))
717
- self.assertEqual(result_not_equal[1], 21) # Sum of df1's columns
718
- self.assertEqual(result_not_equal[2], 22) # Sum of df3's columns
719
-
720
- # Test case with mismatched column names
721
- with self.assertRaises(KeyError):
722
- self.dp.check_sum_of_df_cols_are_equal(
723
- df1,
724
- df2,
725
- cols_1=["nonexistent_col"],
726
- cols_2=["colA", "colB"],
727
- )
728
-
729
- # Test case with empty columns
730
- result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(
731
- df1,
732
- df2,
733
- cols_1=[],
734
- cols_2=[],
735
- )
736
- self.assertEqual(result_empty_cols[1], 0) # Sum of empty columns
737
- self.assertEqual(result_empty_cols[2], 0) # Sum of empty columns
738
- self.assertEqual(result_empty_cols[0], "They are equal")
739
-
740
- def test_convert_2_df_cols_to_dict(self):
741
- # Create a test DataFrame
742
- df = pd.DataFrame(
743
- {"key_col": ["key1", "key2", "key3"], "value_col": [10, 20, 30]},
744
- )
745
-
746
- # Expected dictionary
747
- expected_dict = {"key1": 10, "key2": 20, "key3": 30}
748
-
749
- # Test basic functionality
750
- result = self.dp.convert_2_df_cols_to_dict(df, "key_col", "value_col")
751
- self.assertEqual(result, expected_dict)
752
-
753
- # Test with non-unique keys
754
- df_non_unique = pd.DataFrame(
755
- {"key_col": ["key1", "key2", "key1"], "value_col": [10, 20, 30]},
756
- )
757
- expected_dict_non_unique = {
758
- "key1": 30, # Last occurrence of 'key1' should overwrite the earlier one
759
- "key2": 20,
760
- }
761
- result_non_unique = self.dp.convert_2_df_cols_to_dict(
762
- df_non_unique,
763
- "key_col",
764
- "value_col",
765
- )
766
- self.assertEqual(result_non_unique, expected_dict_non_unique)
767
-
768
- # Test with missing key or value column
769
- with self.assertRaises(ValueError):
770
- self.dp.convert_2_df_cols_to_dict(df, "missing_key_col", "value_col")
771
-
772
- with self.assertRaises(ValueError):
773
- self.dp.convert_2_df_cols_to_dict(df, "key_col", "missing_value_col")
774
-
775
- # Test with empty DataFrame
776
- df_empty = pd.DataFrame(columns=["key_col", "value_col"])
777
- expected_empty_dict = {}
778
- result_empty = self.dp.convert_2_df_cols_to_dict(
779
- df_empty,
780
- "key_col",
781
- "value_col",
782
- )
783
- self.assertEqual(result_empty, expected_empty_dict)
784
-
785
- def test_keyword_lookup_replacement(self):
786
- # Create a test DataFrame
787
- test_data = {
788
- "col1": ["A", "B", "C", "D"],
789
- "col2": ["X", "Y", "Z", "W"],
790
- "value_col": ["old_value", "old_value", "unchanged", "old_value"],
791
- }
792
- df = pd.DataFrame(test_data)
793
-
794
- # Lookup dictionary for replacements
795
- lookup_dict = {"A|X": "new_value_1", "B|Y": "new_value_2", "D|W": "new_value_3"}
796
-
797
- # Expected output
798
- expected_output = df.copy()
799
- expected_output["Updated Column"] = [
800
- "new_value_1",
801
- "new_value_2",
802
- "unchanged",
803
- "new_value_3",
804
- ]
805
-
806
- # Apply the function
807
- result = self.dp.keyword_lookup_replacement(
808
- df.copy(),
809
- col="value_col",
810
- replacement_rows="old_value",
811
- cols_to_merge=["col1", "col2"],
812
- replacement_lookup_dict=lookup_dict,
813
- )
814
-
815
- # Compare the resulting DataFrame with the expected DataFrame
816
- pd.testing.assert_frame_equal(result, expected_output)
817
-
818
- # Test case where no replacement is needed
819
- df_no_replacement = pd.DataFrame(
820
- {
821
- "col1": ["E", "F"],
822
- "col2": ["G", "H"],
823
- "value_col": ["unchanged", "unchanged"],
824
- },
825
- )
826
- expected_no_replacement = df_no_replacement.copy()
827
- expected_no_replacement["Updated Column"] = ["unchanged", "unchanged"]
828
-
829
- result_no_replacement = self.dp.keyword_lookup_replacement(
830
- df_no_replacement.copy(),
831
- col="value_col",
832
- replacement_rows="old_value",
833
- cols_to_merge=["col1", "col2"],
834
- replacement_lookup_dict=lookup_dict,
835
- )
836
-
837
- pd.testing.assert_frame_equal(result_no_replacement, expected_no_replacement)
838
-
839
- def test_convert_df_wide_2_long(self):
840
- # Create a test DataFrame
841
- test_data = {
842
- "id": [1, 2, 3],
843
- "name": ["Alice", "Bob", "Charlie"],
844
- "score1": [85, 90, 78],
845
- "score2": [88, 92, 81],
846
- }
847
- df = pd.DataFrame(test_data)
848
-
849
- # Expected output for the transformation
850
- expected_output = pd.DataFrame(
851
- {
852
- "id": [1, 2, 3, 1, 2, 3],
853
- "name": ["Alice", "Bob", "Charlie", "Alice", "Bob", "Charlie"],
854
- "Stacked": ["score1", "score1", "score1", "score2", "score2", "score2"],
855
- "Value": [85, 90, 78, 88, 92, 81],
856
- },
857
- )
858
-
859
- # Apply the function
860
- result = self.dp.convert_df_wide_2_long(
861
- df.copy(),
862
- value_cols=["score1", "score2"],
863
- variable_col_name="Stacked",
864
- value_col_name="Value",
865
- )
866
-
867
- # Compare the resulting DataFrame with the expected DataFrame
868
- pd.testing.assert_frame_equal(result, expected_output)
869
-
870
- # Test case with only one column (should raise ValueError)
871
- with self.assertRaises(ValueError):
872
- self.dp.convert_df_wide_2_long(
873
- df.copy(),
874
- value_cols=["score1"],
875
- variable_col_name="Stacked",
876
- value_col_name="Value",
877
- )
878
-
879
- # Test case with no value columns (should raise ValueError)
880
- with self.assertRaises(ValueError):
881
- self.dp.convert_df_wide_2_long(
882
- df.copy(),
883
- value_cols=[],
884
- variable_col_name="Stacked",
885
- value_col_name="Value",
886
- )
887
-
888
- def test_format_numbers_with_commas(self):
889
- # Create a test DataFrame
890
- test_data = {
891
- "col1": [1000, 2500000, 12345.678, None],
892
- "col2": [2000.5, 350000.75, 0, -12345],
893
- "col3": ["text", "another text", 50000, 123.45],
894
- }
895
- df = pd.DataFrame(test_data).fillna(value=pd.NA) # Normalize None to pd.NA
896
-
897
- # Expected output with 2 decimal places
898
- expected_data = {
899
- "col1": ["1,000.00", "2,500,000.00", "12,345.68", pd.NA],
900
- "col2": ["2,000.50", "350,000.75", "0.00", "-12,345.00"],
901
- "col3": ["text", "another text", "50,000.00", "123.45"],
902
- }
903
- expected_output = pd.DataFrame(expected_data)
904
-
905
- # Apply the function
906
- result = self.dp.format_numbers_with_commas(df, decimal_length_chosen=2)
907
-
908
- # Compare the resulting DataFrame with the expected DataFrame
909
- pd.testing.assert_frame_equal(result, expected_output, check_dtype=False)
910
-
911
- def test_filter_df_on_multiple_conditions(self):
912
- # Create a test DataFrame
913
- test_data = {
914
- "id": [1, 2, 3, 4, 5],
915
- "value": [10, 20, 30, 40, 50],
916
- "category": ["A", "B", "A", "C", "A"],
917
- "date": pd.to_datetime(
918
- ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"],
919
- ),
920
- }
921
- df = pd.DataFrame(test_data)
922
-
923
- # Test Case 1: Single condition (Equality)
924
- filters_dict = {"category": "== 'A'"}
925
- expected_output = df[df["category"] == "A"]
926
- result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
927
- pd.testing.assert_frame_equal(result, expected_output)
928
-
929
- # Test Case 2: Multiple conditions (Equality and Greater Than)
930
- filters_dict = {"category": "== 'A'", "value": "> 20"}
931
- expected_output = df[(df["category"] == "A") & (df["value"] > 20)]
932
- result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
933
- pd.testing.assert_frame_equal(result, expected_output)
934
-
935
- # Test Case 3: Date comparison
936
- filters_dict = {"date": ">= '2023-01-03'"}
937
- expected_output = df[df["date"] >= pd.to_datetime("2023-01-03")]
938
- result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
939
- pd.testing.assert_frame_equal(result, expected_output)
940
-
941
- # Test Case 4: Inequality
942
- filters_dict = {"value": "!= 30"}
943
- expected_output = df[df["value"] != 30]
944
- result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
945
- pd.testing.assert_frame_equal(result, expected_output)
946
-
947
- # Test Case 5: Mixed conditions
948
- filters_dict = {"category": "== 'A'", "date": "<= '2023-01-03'"}
949
- expected_output = df[
950
- (df["category"] == "A") & (df["date"] <= pd.to_datetime("2023-01-03"))
951
- ]
952
- result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
953
- pd.testing.assert_frame_equal(result, expected_output)
954
-
955
def test_fill_weekly_date_range(self):
    """Check that fill_weekly_date_range inserts the missing weekly rows.

    The input holds three Mondays that are two weeks apart. With
    ``freq="W-MON"`` the result is expected to contain all five Mondays in
    that span, with 0.0 as the value of the two rows absent from the input.
    """
    df = pd.DataFrame(
        {
            # 2023-01-09 and 2023-01-23 are deliberately left out.
            "date": ["2023-01-02", "2023-01-16", "2023-01-30"],
            "value": [10.0, 20.0, 30.0],
        },
    )
    df["date"] = pd.to_datetime(df["date"])

    expected_output = pd.DataFrame(
        {
            "date": [
                "2023-01-02",
                "2023-01-09",
                "2023-01-16",
                "2023-01-23",
                "2023-01-30",
            ],
            "value": [10.0, 0.0, 20.0, 0.0, 30.0],
        },
    )
    expected_output["date"] = pd.to_datetime(expected_output["date"])

    # Use the instance created in setUp rather than building a second one.
    result = self.dp.fill_weekly_date_range(df, date_column="date", freq="W-MON")

    # Compare on a fresh RangeIndex so only the row contents are checked.
    pd.testing.assert_frame_equal(
        result.reset_index(drop=True),
        expected_output.reset_index(drop=True),
    )
987
-
988
def test_add_prefix_and_suffix(self):
    """Check column renaming with and without an excluded date column.

    Without ``date_col`` every column is expected to be wrapped in the
    prefix and suffix. With ``date_col="date"`` the "date" column is
    expected to keep its name while the others are renamed.
    """
    dates = ["2023-01-01", "2023-01-02", "2023-01-03"]
    df = pd.DataFrame(
        {
            "date": dates,
            "value1": [10, 20, 30],
            "value2": [40, 50, 60],
        },
    )

    expected_output_no_date_col = pd.DataFrame(
        {
            "prefix_date_suffix": dates,
            "prefix_value1_suffix": [10, 20, 30],
            "prefix_value2_suffix": [40, 50, 60],
        },
    )
    expected_output_with_date_col = pd.DataFrame(
        {
            "date": dates,
            "prefix_value1_suffix": [10, 20, 30],
            "prefix_value2_suffix": [40, 50, 60],
        },
    )

    # Use the instance created in setUp rather than building a second one.
    # Each call receives a copy so the two scenarios cannot affect each other.
    result_no_date_col = self.dp.add_prefix_and_suffix(
        df.copy(),
        prefix="prefix_",
        suffix="_suffix",
    )
    pd.testing.assert_frame_equal(result_no_date_col, expected_output_no_date_col)

    result_with_date_col = self.dp.add_prefix_and_suffix(
        df.copy(),
        prefix="prefix_",
        suffix="_suffix",
        date_col="date",
    )
    pd.testing.assert_frame_equal(
        result_with_date_col,
        expected_output_with_date_col,
    )
1037
-
1038
def test_create_dummies(self):
    """Check create_dummies against five fixed scenarios.

    Scenarios: threshold 1 without and with ``date_col``, the same two
    inputs with ``add_total_dummy_col="Yes"``, and threshold 0 on data that
    contains negative values.
    """
    threshold = 1
    day_labels = ["2023-01-01", "2023-01-02", "2023-01-03"]

    # Inputs and the dummy frames expected for a threshold of 1.
    plain = pd.DataFrame({"col1": [0, 1, 2], "col2": [3, 4, 0], "col3": [5, 0, 0]})
    plain_dummies = pd.DataFrame(
        {"col1": [0, 0, 1], "col2": [1, 1, 0], "col3": [1, 0, 0]},
    )
    dated = pd.DataFrame({"date": day_labels, "col1": [0, 1, 2], "col2": [3, 4, 0]})
    dated_dummies = pd.DataFrame(
        {"date": day_labels, "col1": [0, 0, 1], "col2": [1, 1, 0]},
    )

    # Scenario 1: no date column.
    pd.testing.assert_frame_equal(
        self.dp.create_dummies(plain.copy(), dummy_threshold=threshold),
        plain_dummies,
    )

    # Scenario 2: the date column is passed through as date_col.
    pd.testing.assert_frame_equal(
        self.dp.create_dummies(
            dated.copy(),
            date_col="date",
            dummy_threshold=threshold,
        ),
        dated_dummies,
    )

    # Scenario 3: a "total" column is appended after the dummies.
    pd.testing.assert_frame_equal(
        self.dp.create_dummies(
            plain.copy(),
            dummy_threshold=threshold,
            add_total_dummy_col="Yes",
        ),
        plain_dummies.assign(total=[1, 1, 1]),
    )

    # Scenario 4: "total" column together with a date column.
    pd.testing.assert_frame_equal(
        self.dp.create_dummies(
            dated.copy(),
            date_col="date",
            dummy_threshold=threshold,
            add_total_dummy_col="Yes",
        ),
        dated_dummies.assign(total=[1, 1, 1]),
    )

    # Scenario 5: threshold 0, so only strictly positive values map to 1.
    signed = pd.DataFrame({"col1": [-1, 0, 1], "col2": [0, 2, -3]})
    signed_dummies = pd.DataFrame({"col1": [0, 0, 1], "col2": [0, 1, 0]})
    pd.testing.assert_frame_equal(
        self.dp.create_dummies(signed.copy(), dummy_threshold=0),
        signed_dummies,
    )
1104
-
1105
def test_replace_substrings(self):
    """Check replace_substrings in place, lower-cased, and into a new column."""
    replacements = {"hello": "hi", "python": "java"}
    source_rows = ["hello world", "python programming", "hello python"]
    replaced_rows = ["hi world", "java programming", "hi java"]

    # Scenario 1: the "text" column itself is rewritten.
    lower_input = pd.DataFrame({"text": source_rows})
    in_place = self.dp.replace_substrings(lower_input.copy(), "text", replacements)
    pd.testing.assert_frame_equal(in_place, pd.DataFrame({"text": replaced_rows}))

    # Scenario 2: mixed-case input with to_lower=True yields the same
    # lower-case output as scenario 1.
    mixed_input = pd.DataFrame(
        {"text": ["Hello World", "PYTHON Programming", "hello PYTHON"]},
    )
    lowered = self.dp.replace_substrings(
        mixed_input.copy(),
        "text",
        replacements,
        to_lower=True,
    )
    pd.testing.assert_frame_equal(lowered, pd.DataFrame({"text": replaced_rows}))

    # Scenario 3: the result goes to "new_text" and "text" stays untouched.
    separate = self.dp.replace_substrings(
        lower_input.copy(),
        "text",
        replacements,
        new_column="new_text",
    )
    pd.testing.assert_frame_equal(
        separate,
        pd.DataFrame({"text": source_rows, "new_text": replaced_rows}),
    )
1149
-
1150
def test_add_total_column(self):
    """Check add_total_column defaults, exclusion, renaming and one column."""
    three_cols = pd.DataFrame(
        {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9]},
    )

    # Scenario 1: default call sums every column into "Total".
    pd.testing.assert_frame_equal(
        self.dp.add_total_column(three_cols.copy()),
        three_cols.assign(Total=[12, 15, 18]),
    )

    # Scenario 2: "col3" stays in the frame but is left out of the sum.
    pd.testing.assert_frame_equal(
        self.dp.add_total_column(three_cols.copy(), exclude_col="col3"),
        three_cols.assign(Total=[5, 7, 9]),
    )

    # Scenario 3: the total column can be given a custom name.
    total_name = "Sum"
    pd.testing.assert_frame_equal(
        self.dp.add_total_column(three_cols.copy(), total_col_name=total_name),
        three_cols.assign(**{total_name: [12, 15, 18]}),
    )

    # Scenario 4: with a single column the total equals that column.
    lone_col = pd.DataFrame({"col1": [1, 2, 3]})
    pd.testing.assert_frame_equal(
        self.dp.add_total_column(lone_col.copy()),
        lone_col.assign(Total=[1, 2, 3]),
    )
1187
-
1188
def test_apply_lookup_table_based_on_substring(self):
    """Check substring-based categorisation, including the "Other" fallback.

    Three of the four sentences contain a key of the lookup dict (in
    differing case) and are expected to be labelled "Fruit"; the sentence
    with no matching key is expected to be labelled "Other".
    """
    sentences = [
        "I love apples",
        "Bananas are great",
        "Something else",
        "Grapes are sour",
    ]
    lookup = dict.fromkeys(["apple", "banana", "cherry", "grape"], "Fruit")

    categorised = self.dp.apply_lookup_table_based_on_substring(
        pd.DataFrame({"text": sentences}),
        "text",
        lookup,
    )

    wanted = pd.DataFrame(
        {
            "text": sentences,
            "Category": ["Fruit", "Fruit", "Other", "Fruit"],
        },
    )
    pd.testing.assert_frame_equal(categorised, wanted)
1223
-
1224
def test_compare_overlap(self):
    """Check compare_overlap on two frames that share two dates.

    ``df1`` covers 2021-01-01 to 2021-01-04 and ``df2`` covers 2021-01-03
    to 2021-01-05, so only 01-03 and 01-04 overlap. The expected per-date
    values are ``df1 - df2`` for each numeric column, and the expected
    summary holds one total per column.
    """
    df1 = pd.DataFrame(
        {
            "date": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04"],
            "value": [10, 15, 20, 25],
            "count": [1, 2, 3, 4],
        },
    )
    df2 = pd.DataFrame(
        {
            "date": ["2021-01-03", "2021-01-04", "2021-01-05"],
            "value": [22, 20, 30],
            "count": [2, 5, 6],
        },
    )

    per_date, totals = self.dp.compare_overlap(df1, df2, "date")

    wanted_per_date = pd.DataFrame(
        {
            "date": pd.to_datetime(["2021-01-03", "2021-01-04"]),
            "diff_value": [-2, 5],
            "diff_count": [1, -1],
        },
    )
    wanted_totals = pd.DataFrame(
        {"Column": ["value", "count"], "Total Difference": [3, 0]},
    )

    def _ordered(frame, key):
        # Sort and renumber so the comparison ignores row order.
        return frame.sort_values(key).reset_index(drop=True)

    pd.testing.assert_frame_equal(
        _ordered(per_date, "date"),
        _ordered(wanted_per_date, "date"),
    )
    pd.testing.assert_frame_equal(
        _ordered(totals, "Column"),
        _ordered(wanted_totals, "Column"),
    )
1272
-
1273
def test_week_commencing_2_week_commencing_conversion_isoweekday(self):
    """Check that each date maps to the Monday that starts its week.

    The input is the seven days from Sunday 2023-01-01 to Saturday
    2023-01-07. With ``week_commencing="mon"`` the Sunday is expected to
    map back to 2022-12-26 and the other six days to 2023-01-02.
    """
    week = pd.DataFrame({"date": pd.date_range("2023-01-01", periods=7, freq="D")})

    converted = self.dp.week_commencing_2_week_commencing_conversion_isoweekday(
        week.copy(),
        date_col="date",
        week_commencing="mon",
    )

    # One entry for the Sunday, then six for Monday through Saturday.
    monday_starts = [pd.Timestamp("2022-12-26")] + [pd.Timestamp("2023-01-02")] * 6
    wanted = pd.Series(monday_starts, name="week_start_mon")

    pd.testing.assert_series_equal(converted["week_start_mon"], wanted)
1305
-
1306
-
1307
- ###################################################################################################################################################
1308
- ###################################################################################################################################################
1309
-
1310
- # class TestDataPull(unittest.TestCase)
1311
-
1312
-
1313
# Script entry point: run the test cases in this module when it is executed
# directly rather than imported.
if __name__ == "__main__":
    unittest.main()