imsciences 0.9.5.9__tar.gz → 0.9.6.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/PKG-INFO +1 -1
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences/pull.py +167 -61
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences.egg-info/PKG-INFO +1 -1
- imsciences-0.9.6.3/imsciences.egg-info/PKG-INFO-TomG-HP-290722 +355 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences.egg-info/SOURCES.txt +1 -1
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/setup.py +2 -2
- imsciences-0.9.5.9/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +0 -24
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/LICENSE.txt +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/README.md +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences/__init__.py +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences/geo.py +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences/mmm.py +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences/unittesting.py +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences/vis.py +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.9.5.9 → imsciences-0.9.6.3}/setup.cfg +0 -0
imsciences/pull.py

@@ -133,7 +133,7 @@ class datapull:
 
         Args:
             week_commencing (str): The starting day of the week for aggregation.
-                                   Options are "mon", "tue", "wed", "
+                                   Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
                                    Default is "mon".
             max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
             delay (int): Delay in seconds between retry attempts. Default is 5.

@@ -144,7 +144,7 @@ class datapull:
             and 'macro_boe_intr_rate' contains the average interest rate for the week.
         """
         # Week commencing dictionary
-        day_dict = {"mon": 0, "tue": 1, "wed": 2, "
+        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 
         # URL of the Bank of England data page
         url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'

@@ -209,7 +209,7 @@ class datapull:
         Args:
             country (list): A string containing a 3-letter code the of country of interest (E.g: "GBR", "FRA", "USA", "DEU")
             week_commencing (str): The starting day of the week for aggregation.
-                                   Options are "mon", "tue", "wed", "
+                                   Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
             start_date (str): Dataset start date in the format "YYYY-MM-DD"
 
         Returns:

@@ -383,7 +383,7 @@ class datapull:
         # ---------------------------------------------------------------------
         # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
         # ---------------------------------------------------------------------
-        day_dict = {"mon": 0, "tue": 1, "wed": 2, "
+        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 
         # ---------------------------------------------------------------------
         # 1. Create daily date range from start_date to today

@@ -668,7 +668,7 @@ class datapull:
             raise ValueError("country_codes must be a list/tuple or a single string.")
 
         # --- Setup / Constants --- #
-        day_dict = {"mon": 0, "tue": 1, "wed": 2, "
+        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
         # Map each 2-letter code to a key
         country_dict = {
             "US": "US_STATES",
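The hunks above all complete the same truncated `day_dict` literal: the weekday lookup the pull functions use to snap daily observations back to the chosen week-commencing day before weekly aggregation. A minimal, self-contained sketch of that pattern follows; the helper name and sample frame are illustrative, not part of the package.

```python
import pandas as pd

# Completed mapping, as in the patched pull.py
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}

def to_week_commencing(dates: pd.Series, week_commencing: str = "mon") -> pd.Series:
    """Snap each daily date back to the chosen week-start day (illustrative helper, not in the package)."""
    offset = (dates.dt.weekday - day_dict[week_commencing] + 7) % 7  # days since the last week-start
    return dates - pd.to_timedelta(offset, unit="D")

# Example: two weeks of daily values averaged per week commencing Monday
daily = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=14, freq="D"),  # 2024-01-01 is a Monday
    "macro_boe_intr_rate": 5.25,
})
daily["week_commencing"] = to_week_commencing(daily["date"], "mon")
weekly = daily.groupby("week_commencing", as_index=False).mean(numeric_only=True)
print(weekly)  # two rows: weeks commencing 2024-01-01 and 2024-01-08
```

The `(weekday - start + 7) % 7` offset is the same expression the `pull_macro_ons_uk` hunk below uses when it builds its `week_commencing` column.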
@@ -1171,74 +1171,110 @@ class datapull:
 
     def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
         """
-        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
        aggregates it to weekly averages, and renames variables based on specified rules.
 
        Parameters:
-            cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
-            week_start_day (str): The day the week starts on (
-            sector (str): The sector for which the standard CDIDs are fetched
+            cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+            week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
+            sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
+                                            (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
        Returns:
-            pd.DataFrame: A DataFrame with weekly frequency, containing
-
+            pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
+                          and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+                          Returns an empty DataFrame if no data is fetched or processed.
        """
        # Define CDIDs for sectors and defaults
-
+        sector_cdids_map = {
            "fast_food": ["L7TD", "L78Q", "DOAD"],
+            "clothing_footwear": ["D7BW","D7GO","CHBJ"],
+            "fuel": ["A9FS","L7FP","CHOL"],
+            "cars":["D7E8","D7E9","D7CO"],
            "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
        }
 
-        default_cdids =
-        sector_specific_cdids =
-
+        default_cdids = sector_cdids_map["default"]
+        sector_specific_cdids = []  # Initialize empty list for sector CDIDs
+
+        if sector:  # Check if sector is not None or empty
+            if isinstance(sector, str):
+                # If it's a single string, wrap it in a list
+                sector_list = [sector]
+            elif isinstance(sector, list):
+                # If it's already a list, use it directly
+                sector_list = sector
+            else:
+                raise TypeError("`sector` parameter must be a string or a list of strings.")
+
+            # Iterate through the list of sectors and collect their CDIDs
+            for sec in sector_list:
+                sector_specific_cdids.extend(sector_cdids_map.get(sec, []))  # Use extend to add items from the list
+
+        standard_cdids = list(set(default_cdids + sector_specific_cdids))  # Combine default and selected sector CDIDs, ensure uniqueness
 
-        # Combine standard CDIDs and additional CDIDs
+        # Combine standard CDIDs and any additional user-provided CDIDs
        if cdid_list is None:
            cdid_list = []
-
+        final_cdid_list = list(set(standard_cdids + cdid_list))  # Ensure uniqueness in the final list
 
        base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
        base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
        combined_df = pd.DataFrame()
 
        # Map week start day to pandas weekday convention
-        days_map = {"mon": 0, "tue": 1, "wed": 2, "
-        if week_start_day not in days_map:
+        days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
+        if week_start_day.lower() not in days_map:
            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
-        week_start = days_map[week_start_day]
+        week_start = days_map[week_start_day.lower()]  # Use lower() for case-insensitivity
 
-        for cdid in
+        for cdid in final_cdid_list:  # Use the final combined list
            try:
                # Search for the series
                search_url = f"{base_search_url}{cdid}"
-                search_response = requests.get(search_url)
+                search_response = requests.get(search_url, timeout=30)  # Add timeout
                search_response.raise_for_status()
                search_data = search_response.json()
 
                items = search_data.get("items", [])
                if not items:
-                    print(f"No data found for CDID: {cdid}")
+                    print(f"Warning: No data found for CDID: {cdid}")
                    continue
 
                # Extract series name and latest release URI
-
-
-
-
-
-
-
-
-
+                # Find the item with the most recent release_date
+                latest_item = None
+                latest_date = None
+                for item in items:
+                    if "release_date" in item:
+                        try:
+                            # Ensure timezone awareness for comparison
+                            current_date = datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                            if latest_date is None or current_date > latest_date:
+                                latest_date = current_date
+                                latest_item = item
+                        except ValueError:
+                            print(f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}")
+                            continue  # Skip this item if date is invalid
+
+                if latest_item is None:
+                    print(f"Warning: No valid release date found for CDID: {cdid}")
+                    continue
+
+                series_name = latest_item.get("title", f"Series_{cdid}")  # Use title from the latest item
+                latest_uri = latest_item.get("uri")
+                if not latest_uri:
+                    print(f"Warning: No URI found for the latest release of CDID: {cdid}")
+                    continue
 
                # Fetch the dataset
                data_url = f"{base_data_url}{latest_uri}"
-                data_response = requests.get(data_url)
+                data_response = requests.get(data_url, timeout=30)  # Add timeout
                data_response.raise_for_status()
                data_json = data_response.json()
 
                # Detect the frequency and process accordingly
+                frequency_key = None
                if "months" in data_json and data_json["months"]:
                    frequency_key = "months"
                elif "quarters" in data_json and data_json["quarters"]:
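The new `sector` handling above boils down to a set union of the default CDIDs, any sector-specific CDIDs, and whatever the caller passes in `cdid_list`. A standalone sketch of that selection logic is below; the sector map is copied from the hunk, while the wrapper function is illustrative only.

```python
# Standalone sketch of the new sector/CDID selection in pull_macro_ons_uk.
SECTOR_CDIDS_MAP = {
    "fast_food": ["L7TD", "L78Q", "DOAD"],
    "clothing_footwear": ["D7BW", "D7GO", "CHBJ"],
    "fuel": ["A9FS", "L7FP", "CHOL"],
    "cars": ["D7E8", "D7E9", "D7CO"],
    "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
}

def build_cdid_list(sector=None, cdid_list=None):
    """Combine default, sector-specific and user-supplied CDIDs into one de-duplicated list."""
    sector_specific = []
    if sector:
        sector_list = [sector] if isinstance(sector, str) else list(sector)
        for sec in sector_list:
            sector_specific.extend(SECTOR_CDIDS_MAP.get(sec, []))  # unknown sectors contribute nothing
    combined = set(SECTOR_CDIDS_MAP["default"]) | set(sector_specific) | set(cdid_list or [])
    return sorted(combined)

# Defaults + fast_food + fuel CDIDs, plus one extra user series
print(build_cdid_list(sector=["fast_food", "fuel"], cdid_list=["HBOI"]))
```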
@@ -1246,72 +1282,142 @@ class datapull:
                elif "years" in data_json and data_json["years"]:
                    frequency_key = "years"
                else:
-                    print(f"Unsupported frequency or no data for CDID: {cdid}")
+                    print(f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}")
                    continue
 
                # Prepare the DataFrame
+                if not data_json[frequency_key]:  # Check if the list of values is empty
+                    print(f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}")
+                    continue
+
                df = pd.DataFrame(data_json[frequency_key])
 
-                #
-                if
-
-
-                    def parse_quarter(quarter_str):
-                        year, qtr = quarter_str.split(" Q")
-                        month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
-                        return datetime(int(year), month, 1)
-                    df["date"] = df["date"].apply(parse_quarter)
-                elif frequency_key == "years":
-                    df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+                # Check if essential columns exist
+                if "date" not in df.columns or "value" not in df.columns:
+                    print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
+                    continue
 
+                # Parse the 'date' field based on frequency
+                try:
+                    if frequency_key == "months":
+                        # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
+                        df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+                    elif frequency_key == "quarters":
+                        def parse_quarter(quarter_str):
+                            try:
+                                year, qtr = quarter_str.split(" Q")
+                                month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                                return datetime(int(year), month, 1)
+                            except (ValueError, KeyError):
+                                return pd.NaT  # Return Not a Time for parsing errors
+                        df["date"] = df["date"].apply(parse_quarter)
+                    elif frequency_key == "years":
+                        df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+                except Exception as e:
+                    print(f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}")
+                    continue  # Skip this series if date parsing fails
+
+                # Coerce value to numeric, handle potential errors
                df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+                # Drop rows where date or value parsing failed
+                df.dropna(subset=["date", "value"], inplace=True)
+
+                if df.empty:
+                    print(f"Warning: No valid data points after processing for CDID: {cdid}")
+                    continue
+
                df.rename(columns={"value": series_name}, inplace=True)
 
                # Combine data
-
+                df_subset = df.loc[:, ["date", series_name]].reset_index(drop=True)  # Explicitly select columns
                if combined_df.empty:
-                    combined_df =
+                    combined_df = df_subset
                else:
-
+                    # Use outer merge to keep all dates, sort afterwards
+                    combined_df = pd.merge(combined_df, df_subset, on="date", how="outer")
 
            except requests.exceptions.RequestException as e:
                print(f"Error fetching data for CDID {cdid}: {e}")
-            except (KeyError, ValueError) as e:
+            except (KeyError, ValueError, TypeError) as e:  # Added TypeError
                print(f"Error processing data for CDID {cdid}: {e}")
+            except Exception as e:  # Catch unexpected errors
+                print(f"An unexpected error occurred for CDID {cdid}: {e}")
+
 
        if not combined_df.empty:
+            # Sort by date after merging to ensure correct forward fill
+            combined_df.sort_values(by="date", inplace=True)
+            combined_df.reset_index(drop=True, inplace=True)
+
+            # Create a complete daily date range
            min_date = combined_df["date"].min()
-            max_date
+            # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
+            max_date = pd.Timestamp(datetime.today().date())  # Use today's date, timezone-naive
+
+            if pd.isna(min_date):
+                print("Error: Minimum date is NaT, cannot create date range.")
+                return pd.DataFrame()
+
+            # Make sure min_date is not NaT before creating the range
            date_range = pd.date_range(start=min_date, end=max_date, freq='D')
            daily_df = pd.DataFrame(date_range, columns=['date'])
+
+            # Merge with original data and forward fill
            daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
            daily_df = daily_df.ffill()
 
+            # Drop rows before the first valid data point after ffill
+            first_valid_index = daily_df.dropna(subset=daily_df.columns.difference(['date'])).index.min()
+            if pd.notna(first_valid_index):
+                daily_df = daily_df.loc[first_valid_index:]
+            else:
+                print("Warning: No valid data points found after forward filling.")
+                return pd.DataFrame()  # Return empty if ffill results in no data
+
+
            # Aggregate to weekly frequency
-
+            # Ensure 'date' column is datetime type before dt accessor
+            daily_df['date'] = pd.to_datetime(daily_df['date'])
+            daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start + 7) % 7, unit='D')  # Corrected logic for week start
+            # Group by week_commencing and calculate mean for numeric columns only
            weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
 
+
            def clean_column_name(name):
+                # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
                name = re.sub(r"\(.*?\)", "", name)
+                # Take only the part before the first colon if present
                name = re.split(r":", name)[0]
-
+                # Remove digits
+                #name = re.sub(r"\d+", "", name)  # Reconsider removing all digits, might be needed for some series
+                # Remove specific words like 'annual', 'rate' case-insensitively
                name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+                # Remove non-alphanumeric characters (except underscore and space)
                name = re.sub(r"[^\w\s]", "", name)
+                # Replace spaces with underscores
+                name = name.strip()  # Remove leading/trailing whitespace
                name = name.replace(" ", "_")
+                # Replace multiple underscores with a single one
                name = re.sub(r"_+", "_", name)
+                # Remove trailing underscores
                name = name.rstrip("_")
+                # Add prefix and suffix
                return f"macro_{name.lower()}_uk"
 
+            # Apply cleaning function to relevant columns
            weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
-            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)  # Rename week commencing col
 
-
+            # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
+            # Consider if 0 is the appropriate fill value for your use case
+            # weekly_df = weekly_df.fillna(0)
 
            return weekly_df
        else:
-            print("No data
+            print("No data successfully fetched or processed.")
            return pd.DataFrame()
-
+
 
    def pull_yfinance(self, tickers=None, week_start_day="mon"):
        """
        Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
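The per-frequency date handling added in this hunk is the part most likely to need adjustment if the ONS observation labels change. A compact sketch of the same parsing rules, runnable outside the class, is below; the sample strings are made up, and the month format assumes the "YYYY Mon" labels noted in the hunk's own comment.

```python
from datetime import datetime
import pandas as pd

def parse_ons_date(raw: str, frequency_key: str):
    """Parse an ONS observation label into a timestamp, mirroring the patched per-frequency rules."""
    if frequency_key == "months":        # e.g. "2023 FEB"
        return pd.to_datetime(raw, format="%Y %b", errors="coerce")
    if frequency_key == "quarters":      # e.g. "2023 Q2" -> first month of the quarter
        try:
            year, qtr = raw.split(" Q")
            month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
            return datetime(int(year), month, 1)
        except (ValueError, KeyError):
            return pd.NaT
    if frequency_key == "years":         # e.g. "2023"
        return pd.to_datetime(raw, format="%Y", errors="coerce")
    return pd.NaT

print(parse_ons_date("2023 Q2", "quarters"))   # 2023-04-01 00:00:00
print(parse_ons_date("2023 FEB", "months"))    # 2023-02-01 00:00:00
print(parse_ons_date("2023 Q5", "quarters"))   # NaT (bad quarter label falls back gracefully)
```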
@@ -1337,7 +1443,7 @@ class datapull:
         end_date = datetime.today().strftime("%Y-%m-%d")
 
         # Mapping week start day to pandas weekday convention
-        days_map = {"mon": 0, "tue": 1, "wed": 2, "
+        days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
         if week_start_day not in days_map:
             raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
         week_start = days_map[week_start_day]

@@ -1497,9 +1603,9 @@ class datapull:
         # Aggregate by week commencing
         day_offsets = {
             'mon': 'W-MON',
-            '
+            'tue': 'W-TUE',
             'wed': 'W-WED',
-            '
+            'thu': 'W-THU',
             'fri': 'W-FRI',
             'sat': 'W-SAT',
             'sun': 'W-SUN'

@@ -1592,9 +1698,9 @@ class datapull:
         # Resample by week
         day_offsets = {
             'mon': 'W-MON',
-            '
+            'tue': 'W-TUE',
             'wed': 'W-WED',
-            '
+            'thu': 'W-THU',
             'fri': 'W-FRI',
             'sat': 'W-SAT',
             'sun': 'W-SUN'
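Both `day_offsets` hunks map the `week_start_day` argument onto pandas' anchored weekly offsets. A small illustration of how such an offset is typically used to resample a daily series to weeks commencing Monday is below; the `resample` call and sample data are illustrative, not the package's exact code.

```python
import pandas as pd

# Anchored weekly offsets, completed as in the two hunks above
day_offsets = {
    'mon': 'W-MON', 'tue': 'W-TUE', 'wed': 'W-WED', 'thu': 'W-THU',
    'fri': 'W-FRI', 'sat': 'W-SAT', 'sun': 'W-SUN',
}

# Illustrative daily series: 2024-01-01 is a Monday
daily = pd.Series(range(14), index=pd.date_range("2024-01-01", periods=14, freq="D"))

# Weeks commencing Monday: label each bin by its left (Monday) edge
weekly = daily.resample(day_offsets['mon'], label='left', closed='left').mean()
print(weekly)
# Expected: 2024-01-01 -> 3.0 (Mon 1 Jan to Sun 7 Jan), 2024-01-08 -> 10.0 (Mon 8 Jan to Sun 14 Jan)
```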
imsciences.egg-info/PKG-INFO-TomG-HP-290722 (new file)

@@ -0,0 +1,355 @@

Metadata-Version: 2.1
Name: imsciences
Version: 0.9.6.3
Summary: IMS Data Processing Package
Author: IMS
Author-email: cam@im-sciences.com
Keywords: data processing,apis,data analysis,data visualization,machine learning
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Operating System :: Unix
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Operating System :: Microsoft :: Windows
Description-Content-Type: text/markdown
License-File: LICENSE.txt
Requires-Dist: pandas
Requires-Dist: plotly
Requires-Dist: numpy
Requires-Dist: fredapi
Requires-Dist: xgboost
Requires-Dist: scikit-learn
Requires-Dist: bs4
Requires-Dist: yfinance
Requires-Dist: holidays
Requires-Dist: google-analytics-data
Requires-Dist: geopandas
Requires-Dist: geopy

# IMS Package Documentation

The **Independent Marketing Sciences** package is a Python library designed to process incoming data into a format tailored for projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data collection, manipulation, visualisation and analysis.

---

## Key Features
- Seamless data processing for time series workflows.
- Aggregation, filtering, and transformation of time series data.
- Visualising Data
- Integration with external data sources like FRED, Bank of England and ONS.

---

Table of Contents
=================

1. [Usage](#usage)
2. [Data Processing for Time Series](#data-processing-for-time-series)
3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
4. [Data Visualisations](#data-visualisations)
5. [Data Pulling](#data-pulling)
6. [Installation](#installation)
7. [License](#license)
8. [Roadmap](#roadmap)

---

## Usage

```python
from imsciences import dataprocessing, geoprocessing, datapull, datavis
ims_proc = dataprocessing()
ims_geo = geoprocessing()
ims_pull = datapull()
ims_vis = datavis()
```

## Data Processing for Time Series

## 1. `get_wd_levels`
- **Description**: Get the working directory with the option of moving up parents.
- **Usage**: `get_wd_levels(levels)`
- **Example**: `get_wd_levels(0)`

## 2. `aggregate_daily_to_wc_long`
- **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
- **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
- **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`

## 3. `convert_monthly_to_daily`
- **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
- **Usage**: `convert_monthly_to_daily(df, date_column, divide=True)`
- **Example**: `convert_monthly_to_daily(df, 'date')`

## 4. `week_of_year_mapping`
- **Description**: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.
- **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
- **Example**: `week_of_year_mapping(df, 'week', 'mon')`

## 5. `rename_cols`
- **Description**: Renames columns in a pandas DataFrame with a specified prefix or format.
- **Usage**: `rename_cols(df, name='ame_')`
- **Example**: `rename_cols(df, 'ame_facebook')`

## 6. `merge_new_and_old`
- **Description**: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.
- **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
- **Example**: `merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')`

## 7. `merge_dataframes_on_column`
- **Description**: Merge a list of DataFrames on a common column.
- **Usage**: `merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')`
- **Example**: `merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')`

## 8. `merge_and_update_dfs`
- **Description**: Merges two dataframes, updating columns from the second dataframe where values are available.
- **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
- **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`

## 9. `convert_us_to_uk_dates`
- **Description**: Convert a DataFrame column with mixed US and UK date formats to datetime.
- **Usage**: `convert_us_to_uk_dates(df, date_col)`
- **Example**: `convert_us_to_uk_dates(df, 'date')`

## 10. `combine_sheets`
- **Description**: Combines multiple DataFrames from a dictionary into a single DataFrame.
- **Usage**: `combine_sheets(all_sheets)`
- **Example**: `combine_sheets({'Sheet1': df1, 'Sheet2': df2})`

## 11. `pivot_table`
- **Description**: Dynamically pivots a DataFrame based on specified columns.
- **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
- **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)`

## 12. `apply_lookup_table_for_columns`
- **Description**: Maps substrings in columns to new values based on a dictionary.
- **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
- **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')`

## 13. `aggregate_daily_to_wc_wide`
- **Description**: Aggregates daily data into weekly data and pivots it to wide format.
- **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)`
- **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)`

## 14. `merge_cols_with_seperator`
- **Description**: Merges multiple columns in a DataFrame into one column with a specified separator.
- **Usage**: `merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')`
- **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')`

## 15. `check_sum_of_df_cols_are_equal`
- **Description**: Checks if the sum of two columns in two DataFrames are equal and provides the difference.
- **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
- **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`

## 16. `convert_2_df_cols_to_dict`
- **Description**: Creates a dictionary from two DataFrame columns.
- **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
- **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`

## 17. `create_FY_and_H_columns`
- **Description**: Adds financial year and half-year columns to a DataFrame based on a start date.
- **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
- **Example**: `create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')`

## 18. `keyword_lookup_replacement`
- **Description**: Updates values in a column based on a lookup dictionary with conditional logic.
- **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
- **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')`

## 19. `create_new_version_of_col_using_LUT`
- **Description**: Creates a new column based on a lookup table applied to an existing column.
- **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
- **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)`

## 20. `convert_df_wide_2_long`
- **Description**: Converts a wide-format DataFrame into a long-format DataFrame.
- **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
- **Example**: `convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')`

## 21. `manually_edit_data`
- **Description**: Manually updates specified cells in a DataFrame based on filters.
- **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
- **Example**: `manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')`

## 22. `format_numbers_with_commas`
- **Description**: Formats numerical columns with commas and a specified number of decimal places.
- **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
- **Example**: `format_numbers_with_commas(df, decimal_length_chosen=1)`

## 23. `filter_df_on_multiple_conditions`
- **Description**: Filters a DataFrame based on multiple column conditions.
- **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
- **Example**: `filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': "== 'val'"})`

## 24. `read_and_concatenate_files`
- **Description**: Reads and concatenates files from a specified folder into a single DataFrame.
- **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
- **Example**: `read_and_concatenate_files('/path/to/files', file_type='xlsx')`

## 25. `upgrade_outdated_packages`
- **Description**: Upgrades all outdated Python packages except specified ones.
- **Usage**: `upgrade_outdated_packages(exclude_packages=['twine'])`
- **Example**: `upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])`

## 26. `convert_mixed_formats_dates`
- **Description**: Converts mixed-format date columns into standardized datetime format.
- **Usage**: `convert_mixed_formats_dates(df, column_name)`
- **Example**: `convert_mixed_formats_dates(df, 'date_col')`

## 27. `fill_weekly_date_range`
- **Description**: Fills in missing weekly dates in a DataFrame with a specified frequency.
- **Usage**: `fill_weekly_date_range(df, date_column, freq='W-MON')`
- **Example**: `fill_weekly_date_range(df, 'date_col')`

## 28. `add_prefix_and_suffix`
- **Description**: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.
- **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
- **Example**: `add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')`

## 29. `create_dummies`
- **Description**: Creates dummy variables for columns, with an option to add a total dummy column.
- **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
- **Example**: `create_dummies(df, date_col='date_col', dummy_threshold=1)`

## 30. `replace_substrings`
- **Description**: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.
- **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
- **Example**: `replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')`

## 31. `add_total_column`
- **Description**: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.
- **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
- **Example**: `add_total_column(df, exclude_col='date_col')`

## 32. `apply_lookup_table_based_on_substring`
- **Description**: Categorizes text in a column using a lookup table based on substrings.
- **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
- **Example**: `apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})`

## 33. `compare_overlap`
- **Description**: Compares overlapping periods between two DataFrames and summarizes differences.
- **Usage**: `compare_overlap(df1, df2, date_col)`
- **Example**: `compare_overlap(df1, df2, 'date_col')`

## 34. `week_commencing_2_week_commencing_conversion_isoweekday`
- **Description**: Maps dates to the start of the current ISO week based on a specified weekday.
- **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
- **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`

## 35. `seasonality_feature_extraction`
- **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
- **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
- **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`

---

## Data Processing for Incrementality Testing

## 1. `pull_ga`
- **Description**: Pull in GA4 data for geo experiments.
- **Usage**: `pull_ga(credentials_file, property_id, start_date, country, metrics)`
- **Example**: `pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])`

## 2. `process_itv_analysis`
- **Description**: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.
- **Usage**: `process_itv_analysis(self, raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
- **Example**: `process_itv_analysis(df, 'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum'])`

## 3. `process_city_analysis`
- **Description**: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.
- **Usage**: `process_city_analysis(raw_df, spend_df, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
- **Example**: `process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'sum'])`

---

## Data Visualisations

## 1. `plot_one`
- **Description**: Plots a specified column from a DataFrame with white background and black axes.
- **Usage**: `plot_one(df1, col1, date_column)`
- **Example**: `plot_one(df, 'sales', 'date')`

## 2. `plot_two`
- **Description**: Plots specified columns from two DataFrames, optionally on the same or separate y-axes.
- **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
- **Example**: `plot_two(df1, 'sales', df2, 'revenue', 'date', same_axis=False)`

## 3. `plot_chart`
- **Description**: Plots various chart types using Plotly, including line, bar, scatter, area, pie, etc.
- **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values')`
- **Example**: `plot_chart(df, 'date', ['sales', 'revenue'], chart_type='line', title='Sales and Revenue')`

---

## Data Pulling

## 1. `pull_fred_data`
- **Description**: Fetch data from FRED using series ID tokens.
- **Usage**: `pull_fred_data(week_commencing, series_id_list)`
- **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`

## 2. `pull_boe_data`
- **Description**: Fetch and process Bank of England interest rate data.
- **Usage**: `pull_boe_data(week_commencing)`
- **Example**: `pull_boe_data('mon')`

## 3. `pull_oecd`
- **Description**: Fetch macroeconomic data from OECD for a specified country.
- **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')`
- **Example**: `pull_oecd('GBR', 'mon', '2000-01-01')`

## 4. `get_google_mobility_data`
- **Description**: Fetch Google Mobility data for the specified country.
- **Usage**: `get_google_mobility_data(country, wc)`
- **Example**: `get_google_mobility_data('United Kingdom', 'mon')`

## 5. `pull_seasonality`
- **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
- **Usage**: `pull_seasonality(week_commencing, start_date, countries)`
- **Example**: `pull_seasonality('mon', '2020-01-01', ['US', 'GB'])`

## 6. `pull_weather`
- **Description**: Fetch and process historical weather data for the specified country.
- **Usage**: `pull_weather(week_commencing, start_date, country)`
- **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`

## 7. `pull_macro_ons_uk`
- **Description**: Fetch and process time series data from the Beta ONS API.
- **Usage**: `pull_macro_ons_uk(additional_list, week_commencing, sector)`
- **Example**: `pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')`

## 8. `pull_yfinance`
- **Description**: Fetch and process time series data from Yahoo Finance.
- **Usage**: `pull_yfinance(tickers, week_start_day)`
- **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`

## 9. `pull_sports_events`
- **Description**: Pull a variety of sports events, primarily football and rugby.
- **Usage**: `pull_sports_events(start_date, week_commencing)`
- **Example**: `pull_sports_events('2020-01-01', 'mon')`

---

## Installation

Install the IMS package via pip:

```bash
pip install imsciences
```

---

## License

This project is licensed under the MIT License.

---

## Roadmap

- [Fixes]: Naming conventions are inconsistent / have changed from previous seasonality tools (e.g. 'seas_nyd' is named 'seas_new_years_day', 'week_1' is named 'seas_1').
- [Fixes]: Naming conventions can be inconsistent within the data pull (the suffix on some variables is 'gb', on some it is 'uk', and for others there is no suffix); furthermore, there is a lack of consistency for global holidays/events (Christmas, Easter, Halloween, etc.) - some have a regional suffix and others don't.
- [Additions]: Need to add new data pulls for more macro and seasonal variables.

---
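Putting the pull.py changes and the README together, here is a minimal usage sketch of the updated `pull_macro_ons_uk` (signature taken from the diff, instantiation from the Usage section above). It requires network access to the ONS API, and the exact columns returned depend on the live data.

```python
from imsciences import datapull

ims_pull = datapull()

# Default CDIDs plus the 'fast_food' and 'fuel' sector CDIDs, plus one extra series ('HBOI'),
# aggregated to weeks commencing Monday. The result has an 'OBS' week-commencing column and
# 'macro_..._uk' series columns; the exact columns depend on what the ONS API returns.
ons_weekly = ims_pull.pull_macro_ons_uk(
    cdid_list=["HBOI"],
    week_start_day="mon",
    sector=["fast_food", "fuel"],
)
print(ons_weekly.head())
```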
imsciences.egg-info/SOURCES.txt

@@ -8,7 +8,7 @@ imsciences/pull.py
 imsciences/unittesting.py
 imsciences/vis.py
 imsciences.egg-info/PKG-INFO
-imsciences.egg-info/PKG-INFO-
+imsciences.egg-info/PKG-INFO-TomG-HP-290722
 imsciences.egg-info/SOURCES.txt
 imsciences.egg-info/dependency_links.txt
 imsciences.egg-info/requires.txt
setup.py

@@ -8,7 +8,7 @@ def read_md(file_name):
             return f.read()
     return ''
 
-VERSION = '0.9.
+VERSION = '0.9.6.3'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')
 

@@ -34,5 +34,5 @@ setup(
         "Operating System :: Unix",
         "Operating System :: MacOS :: MacOS X",
         "Operating System :: Microsoft :: Windows",
-    ]
+    ],
 )
imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 (removed)

@@ -1,24 +0,0 @@
-Metadata-Version: 2.1
-Name: imsciences
-Version: 0.6.1.1
-Summary: IMS Data Processing Package
-Author: IMS
-Author-email: cam@im-sciences.com
-Keywords: python,data processing
-Classifier: Development Status :: 3 - Alpha
-Classifier: Intended Audience :: Developers
-Classifier: Programming Language :: Python :: 3
-Classifier: Operating System :: Unix
-Classifier: Operating System :: MacOS :: MacOS X
-Classifier: Operating System :: Microsoft :: Windows
-Description-Content-Type: text/markdown
-Requires-Dist: pandas
-
-# IMS Package Documentation
-
-The IMS package is a python library for processing incoming data into a format that can be used for projects. IMS processing offers a variety of functions to manipulate and analyze data efficiently. Here are the functionalities provided by the package:
-
-## Data Processing
-
-## Data Pulling
-
All remaining files (listed above with +0 -0) are unchanged.