PyPI - imsciences - Versions diffs - 1.0.5__tar.gz → 1.0.6__tar.gz - Mend

imsciences 1.0.5.tar.gz → 1.0.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of imsciences might be problematic. Click here for more details.

Files changed (17) hide show

{imsciences-1.0.5 → imsciences-1.0.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 1.0.5
+Version: 1.0.6
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com

{imsciences-1.0.5 → imsciences-1.0.6}/imsciences/pull.py RENAMED Viewed

@@ -2397,13 +2397,13 @@ class datapull:
             cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
             week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
             sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
-                                             (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
+                                                (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
         Returns
         -------
             pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
-                          and all series as renamed columns (e.g., 'macro_retail_sales_uk').
-                          Returns an empty DataFrame if no data is fetched or processed.
+                            and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+                            Returns an empty DataFrame if no data is fetched or processed.
         """
         # Define CDIDs for sectors and defaults
@@ -2436,16 +2436,11 @@ class datapull:
                     sector_cdids_map.get(sec, []),
                 )  # Use extend to add items from the list
-        standard_cdids = list(
-            set(default_cdids + sector_specific_cdids),
-        )  # Combine default and selected sector CDIDs, ensure uniqueness
         # Combine standard CDIDs and any additional user-provided CDIDs
+        standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
         if cdid_list is None:
             cdid_list = []
-        final_cdid_list = list(
-            set(standard_cdids + cdid_list),
-        )  # Ensure uniqueness in the final list
+        final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
         base_search_url = (
             "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
@@ -2670,26 +2665,59 @@ class datapull:
             )
             def clean_column_name(name):
-                # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
+                # Remove content within parentheses
                 name = re.sub(r"\(.*?\)", "", name)
-                # Take only the part before the first colon if present
-                name = re.split(r":", name)[0]
-                # Remove digits
-                # name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
-                # Remove specific words like 'annual', 'rate' case-insensitively
-                name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+                # Special handling for ANY CPI items (not just CPI INDEX)
+                if "CPI" in name.upper():
+                    # Extract the description part after the colon for CPI items
+                    if ":" in name:
+                        parts = name.split(":")
+                        if len(parts) >= 2:
+                            # Take the description part (usually the second part)
+                            description = parts[1].strip()
+                            # Remove any remaining colons and everything after
+                            description = description.split(":")[0].strip()
+                            name = f"CPI {description}"
+                    # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
+                    name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
+                else:
+                    # For non-CPI items, take only the part before the first colon
+                    name = re.split(r":", name)[0]
+                    # Remove all digits for non-CPI items too
+                    name = re.sub(r"\d+", "", name)
+                # Remove year references like "2015=100"
+                name = re.sub(r"\d{4}=\d+", "", name)
+                # Remove specific words case-insensitively
+                name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
+                # Remove percentage symbols and "%"
+                name = re.sub(r"%", "percent", name)
                 # Remove non-alphanumeric characters (except underscore and space)
                 name = re.sub(r"[^\w\s]", "", name)
                 # Replace spaces with underscores
-                name = name.strip()  # Remove leading/trailing whitespace
-                name = name.replace(" ", "_")
+                name = name.strip().replace(" ", "_")
                 # Replace multiple underscores with a single one
                 name = re.sub(r"_+", "_", name)
-                # Remove trailing underscores
-                name = name.rstrip("_")
-                # Add prefix and suffix
+                # Remove leading/trailing underscores
+                name = name.strip("_")
+                # Truncate very long names (optional)
+                if len(name) > 50:
+                    words = name.split("_")
+                    # Keep first few meaningful words
+                    name = "_".join(words[:4])
                 return f"macro_{name.lower()}_uk"
             # Apply cleaning function to relevant columns
             weekly_df.columns = [
                 clean_column_name(col) if col != "week_commencing" else col
@@ -2704,14 +2732,16 @@ class datapull:
             # Consider if 0 is the appropriate fill value for your use case
             # weekly_df = weekly_df.fillna(0)
-            # Create new column names, keeping "OBS" unchanged
-            new_columns = []
-            for col in weekly_df.columns:
-                if col == "OBS":
-                    new_columns.append(col)
+            # Get only the data columns (excluding OBS)
+            data_columns = [col for col in weekly_df.columns if col != "OBS"]
+            new_columns = ["OBS"]
+            for i, col in enumerate(data_columns):
+                if i < len(final_cdid_list):
+                    new_columns.append(f"{col}_{final_cdid_list[i]}")
                 else:
-                    new_columns.append(f"{col}_{cdid}")
+                    new_columns.append(col)  # Keep original if no matching CDID
             # Apply the new column names to the DataFrame
             weekly_df.columns = new_columns

{imsciences-1.0.5 → imsciences-1.0.6}/imsciences.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 1.0.5
+Version: 1.0.6
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com

{imsciences-1.0.5 → imsciences-1.0.6}/setup.py RENAMED Viewed

@@ -21,7 +21,7 @@ def get_version():
         for line in f:
             if line.startswith("__version__"):
                 return line.split("=")[1].strip().strip('"').strip("'")
-    return "1.0.4"  # Start from 1.0.0 instead of 0.0.0
+    return "1.0.5"  # Start from 1.0.0 instead of 0.0.0
 def increment_version():