imsciences 1.0.5__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 1.0.5
3
+ Version: 1.0.6
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
@@ -2397,13 +2397,13 @@ class datapull:
2397
2397
  cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
2398
2398
  week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
2399
2399
  sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
2400
- (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
2400
+ (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
2401
2401
 
2402
2402
  Returns
2403
2403
  -------
2404
2404
  pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
2405
- and all series as renamed columns (e.g., 'macro_retail_sales_uk').
2406
- Returns an empty DataFrame if no data is fetched or processed.
2405
+ and all series as renamed columns (e.g., 'macro_retail_sales_uk').
2406
+ Returns an empty DataFrame if no data is fetched or processed.
2407
2407
 
2408
2408
  """
2409
2409
  # Define CDIDs for sectors and defaults
@@ -2436,16 +2436,11 @@ class datapull:
2436
2436
  sector_cdids_map.get(sec, []),
2437
2437
  ) # Use extend to add items from the list
2438
2438
 
2439
- standard_cdids = list(
2440
- set(default_cdids + sector_specific_cdids),
2441
- ) # Combine default and selected sector CDIDs, ensure uniqueness
2442
-
2443
2439
  # Combine standard CDIDs and any additional user-provided CDIDs
2440
+ standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
2444
2441
  if cdid_list is None:
2445
2442
  cdid_list = []
2446
- final_cdid_list = list(
2447
- set(standard_cdids + cdid_list),
2448
- ) # Ensure uniqueness in the final list
2443
+ final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
2449
2444
 
2450
2445
  base_search_url = (
2451
2446
  "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
@@ -2670,26 +2665,59 @@ class datapull:
2670
2665
  )
2671
2666
 
2672
2667
  def clean_column_name(name):
2673
- # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
2668
+ # Remove content within parentheses
2674
2669
  name = re.sub(r"\(.*?\)", "", name)
2675
- # Take only the part before the first colon if present
2676
- name = re.split(r":", name)[0]
2677
- # Remove digits
2678
- # name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
2679
- # Remove specific words like 'annual', 'rate' case-insensitively
2680
- name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
2670
+
2671
+ # Special handling for ANY CPI items (not just CPI INDEX)
2672
+ if "CPI" in name.upper():
2673
+ # Extract the description part after the colon for CPI items
2674
+ if ":" in name:
2675
+ parts = name.split(":")
2676
+ if len(parts) >= 2:
2677
+ # Take the description part (usually the second part)
2678
+ description = parts[1].strip()
2679
+ # Remove any remaining colons and everything after
2680
+ description = description.split(":")[0].strip()
2681
+ name = f"CPI {description}"
2682
+
2683
+ # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
2684
+ name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
2685
+
2686
+ else:
2687
+ # For non-CPI items, take only the part before the first colon
2688
+ name = re.split(r":", name)[0]
2689
+ # Remove all digits for non-CPI items too
2690
+ name = re.sub(r"\d+", "", name)
2691
+
2692
+ # Remove year references like "2015=100"
2693
+ name = re.sub(r"\d{4}=\d+", "", name)
2694
+
2695
+ # Remove specific words case-insensitively
2696
+ name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
2697
+
2698
+ # Remove percentage symbols and "%"
2699
+ name = re.sub(r"%", "percent", name)
2700
+
2681
2701
  # Remove non-alphanumeric characters (except underscore and space)
2682
2702
  name = re.sub(r"[^\w\s]", "", name)
2703
+
2683
2704
  # Replace spaces with underscores
2684
- name = name.strip() # Remove leading/trailing whitespace
2685
- name = name.replace(" ", "_")
2705
+ name = name.strip().replace(" ", "_")
2706
+
2686
2707
  # Replace multiple underscores with a single one
2687
2708
  name = re.sub(r"_+", "_", name)
2688
- # Remove trailing underscores
2689
- name = name.rstrip("_")
2690
- # Add prefix and suffix
2709
+
2710
+ # Remove leading/trailing underscores
2711
+ name = name.strip("_")
2712
+
2713
+ # Truncate very long names (optional)
2714
+ if len(name) > 50:
2715
+ words = name.split("_")
2716
+ # Keep first few meaningful words
2717
+ name = "_".join(words[:4])
2718
+
2691
2719
  return f"macro_{name.lower()}_uk"
2692
-
2720
+
2693
2721
  # Apply cleaning function to relevant columns
2694
2722
  weekly_df.columns = [
2695
2723
  clean_column_name(col) if col != "week_commencing" else col
@@ -2704,14 +2732,16 @@ class datapull:
2704
2732
  # Consider if 0 is the appropriate fill value for your use case
2705
2733
  # weekly_df = weekly_df.fillna(0)
2706
2734
 
2707
- # Create new column names, keeping "OBS" unchanged
2708
- new_columns = []
2709
- for col in weekly_df.columns:
2710
- if col == "OBS":
2711
- new_columns.append(col)
2735
+ # Get only the data columns (excluding OBS)
2736
+ data_columns = [col for col in weekly_df.columns if col != "OBS"]
2737
+
2738
+ new_columns = ["OBS"]
2739
+ for i, col in enumerate(data_columns):
2740
+ if i < len(final_cdid_list):
2741
+ new_columns.append(f"{col}_{final_cdid_list[i]}")
2712
2742
  else:
2713
- new_columns.append(f"{col}_{cdid}")
2714
-
2743
+ new_columns.append(col) # Keep original if no matching CDID
2744
+
2715
2745
  # Apply the new column names to the DataFrame
2716
2746
  weekly_df.columns = new_columns
2717
2747
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 1.0.5
3
+ Version: 1.0.6
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
@@ -21,7 +21,7 @@ def get_version():
21
21
  for line in f:
22
22
  if line.startswith("__version__"):
23
23
  return line.split("=")[1].strip().strip('"').strip("'")
24
- return "1.0.4" # Start from 1.0.0 instead of 0.0.0
24
+ return "1.0.5" # Start from 1.0.0 instead of 0.0.0
25
25
 
26
26
 
27
27
  def increment_version():
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes