geocif 0.1.80__tar.gz → 0.1.82__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.80/geocif.egg-info → geocif-0.1.82}/PKG-INFO +1 -1
- {geocif-0.1.80 → geocif-0.1.82}/geocif/cei/indices.py +36 -24
- {geocif-0.1.80 → geocif-0.1.82}/geocif/geocif.py +37 -11
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner.py +2 -2
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/embedding.py +4 -1
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/stages.py +5 -0
- {geocif-0.1.80 → geocif-0.1.82/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.80 → geocif-0.1.82}/setup.py +1 -1
- {geocif-0.1.80 → geocif-0.1.82}/LICENSE +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/MANIFEST.in +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/README.md +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/analysis.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/constants.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/features.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/geo.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/backup/models.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/experiments.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/geocif_runner.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/logger.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/output.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/stats.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/trainers.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/trend.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/ml/xai.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/mm.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/aa.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/area.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/automl.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/download_esi.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/enso.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/eval.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/gamtest.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/gee_access.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/misc.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/reg.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/sustain.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/tmp.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/tmp2.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/tmp3.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/tmp4.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/tmp5.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/risk/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/utils.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/viz/gt.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/viz/plot.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif/viz/tmp.py +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/requirements.txt +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/setup.cfg +0 -0
- {geocif-0.1.80 → geocif-0.1.82}/tests/test_geocif.py +0 -0
geocif/cei/indices.py

@@ -158,9 +158,14 @@ def adjust_dataframes(df: pd.DataFrame) -> pd.DataFrame:
     earliest_year = df["time"].dt.year.min()
     desired_start_year = earliest_year + 1
     desired_start_date_dynamic = pd.Timestamp(f"{desired_start_year}-01-01")
+
+    # Calculate the difference between the earliest date in the dataset and the desired start date
     min_date_new = df["time"].min()
     date_difference_dynamic = desired_start_date_dynamic - min_date_new
+
+    # Adjust all dates in the 'time' column forward by the calculated difference
     df["time"] = df["time"] + date_difference_dynamic
+
     return df
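The comments added here document shift logic that was already in place: every timestamp is moved forward by one fixed offset so the series starts on January 1 of the year after the earliest observation. A minimal standalone sketch of the same idea, using made-up data rather than anything from the package:

```python
import pandas as pd

# Hypothetical frame with a datetime "time" column
df = pd.DataFrame({"time": pd.date_range("2001-06-15", periods=3, freq="D")})

earliest_year = df["time"].dt.year.min()                    # 2001
desired_start = pd.Timestamp(f"{earliest_year + 1}-01-01")  # 2002-01-01

# Shift every date forward by the same offset so the series begins at desired_start
offset = desired_start - df["time"].min()
df["time"] = df["time"] + offset
print(df["time"].min())  # 2002-01-01 00:00:00
```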
@@ -214,8 +219,9 @@ def get_icclim_dates(
     # end_br: latest date - 2 years
     end_br = str(df_all_years_ix.index[-1][2] - relativedelta(years=2))

-    start_tr = np.datetime_as_string(df_harvest_year_ix.index[0][2])
-    end_tr = np.datetime_as_string(df_harvest_year_ix.index[-1][2])
+    start_tr = np.datetime_as_string(df_harvest_year_ix.index[0][2].to_datetime64())
+    end_tr = np.datetime_as_string(df_harvest_year_ix.index[-1][2].to_datetime64())
+
     return start_br, end_br, start_tr, end_tr
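The `.to_datetime64()` calls convert a pandas `Timestamp` into the NumPy `datetime64` scalar that `np.datetime_as_string` expects; passing a `Timestamp` directly raises a TypeError. A small sketch of the difference:

```python
import numpy as np
import pandas as pd

ts = pd.Timestamp("2020-05-01")

# np.datetime_as_string(ts) raises TypeError: it wants a numpy datetime64,
# not a pandas Timestamp. Converting first makes it work:
print(np.datetime_as_string(ts.to_datetime64()))
# -> '2020-05-01T00:00:00.000000000'
```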
@@ -244,8 +250,11 @@ def compute_indices(
     df_time_period = adjust_dataframes(df_time_period)
     df_base_period = adjust_dataframes(df_base_period)

-
-
+    try:
+        dx, vals_ix = df_to_xarray(df_base_period)
+        start_br, end_br, start_tr, end_tr = get_icclim_dates(vals_ix, df_time_period.set_index(["lat", "lon", "time"]))
+    except:
+        breakpoint()

     # For seasonal indices, slice_mode is used, but for SPI indices it fails
     slice_mode = (
@@ -277,6 +286,7 @@ def compute_indices(
         "Error computing %s for %s to %s: %s",
         index_name, start_tr, end_tr, e
     )
+    breakpoint()

     return ds
@@ -319,11 +329,11 @@ METHOD_TO_COLUMN = {
     "full_season": "crop_cal",
     "fraction_season": "fraction_season",
     "dekad": "dekad",
-    "dekad_r": "
+    "dekad_r": "dekad_r",
     "biweekly": "biweekly",
-    "biweekly_r": "
+    "biweekly_r": "biweekly_r",
     "monthly": "monthly",
-    "monthly_r": "
+    "monthly_r": "monthly_r"
 }
@@ -545,7 +555,7 @@ class CEIs:
         if not col:
             raise ValueError(f"Unknown method: {self.method}")

-        stages =
+        stages = df[col].unique()
         valid_stages = None

         if self.method == "phenological_stages":
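With the `METHOD_TO_COLUMN` values and the `stages` assignment restored, the method name now resolves to a real column whose distinct values become the stages. A tiny illustration of that lookup on a hypothetical frame:

```python
import pandas as pd

METHOD_TO_COLUMN = {"dekad": "dekad", "dekad_r": "dekad_r"}  # excerpt of the mapping

df = pd.DataFrame({"dekad_r": [1, 1, 2, 3]})
col = METHOD_TO_COLUMN.get("dekad_r")
stages = df[col].unique()  # distinct stage labels in the column
print(stages)              # [1 2 3]
```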
@@ -636,22 +646,24 @@ class CEIs:
         )

         # 1) ICCLIM-based indices
- [… 16 removed lines; their content was not preserved in the diff view …]
+        try:
+            for index_name, (index_type, index_details) in di.dict_indices.items():
+                ds = compute_indices(df_time_period, df_base_period, index_name)
+                if ds:
+                    df_out = ds.to_dataframe().reset_index()
+                    df_processed = self.process_row(
+                        df_out,
+                        df_harvest_year_region,
+                        extended_stage,
+                        key,
+                        index_name,
+                        index_type,
+                        index_details
+                    )
+                    if not df_processed.empty:
+                        frames_group.append(df_processed)
+        except:
+            breakpoint()
         # 2) EO indices (NDVI, ESI, GCVI, H-INDEX, etc.)
         for eo_var in ["GCVI", "NDVI", "ESI4WK", "H-INDEX"]:
             df_eo = self.compute_eo_indices(df_time_period, df_harvest_year_region, eo_var, key, extended_stage)
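The bare `except:` with `breakpoint()` around the ICCLIM loop drops into `pdb` on any failure, which looks like temporary debugging scaffolding rather than production error handling. For contrast, a sketch of a more conventional pattern (our example, not what the package does) that logs the traceback and re-raises:

```python
import logging

logger = logging.getLogger(__name__)

def risky_step():
    raise ValueError("example failure")  # stand-in for the ICCLIM index loop

try:
    risky_step()
except Exception:
    # Log the full traceback, then re-raise so callers still see the error
    logger.exception("ICCLIM index computation failed")
    raise
```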
geocif/geocif.py

@@ -945,11 +945,12 @@ class Geocif:
         parts = all_cei_columns[-1].split("_")
         cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])

-        # For each region, find the column with the longest string in cei_column
-        group_by = ["Region"]
-        groups = df.groupby(group_by)
         if self.use_cumulative_features:
             frames = []
+            # For each region, find the column with the longest string in cei_column
+            group_by = ["Region"]
+            groups = df.groupby(group_by)
+
             for name, group in groups:
                 # Drop columns with all NaNs
                 group.dropna(axis=1, how="all", inplace=True)
@@ -1019,26 +1020,45 @@ class Geocif:
         # Drop those columns

         df = df.drop(columns=cols_to_drop)
-
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("<0>", dupes)
         # Hack: If
         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
         df = stages.update_feature_names(df, self.method)
-
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("<111>", dupes)
         all_cei_columns = self.get_cei_column_names(df)
         # Fill in any missing values with 0
         df.loc[:, all_cei_columns].fillna(0, inplace=True)
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("<1>", dupes)

         df = fe.compute_last_year_yield(df, self.target)
-
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("<2>", dupes)
         df = fe.compute_median_statistics(
             df, self.all_seasons_with_yield, self.number_median_years, self.target
         )
-
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("<3>", dupes)
         df = fe.compute_user_median_statistics(df, range(2018, 2023))

         df = fe.compute_user_median_statistics(df, range(2013, 2018))
-
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("<4>", dupes)
         if self.median_area_as_feature:
             df = fe.compute_median_statistics(
                 df, self.all_seasons_with_yield, self.number_median_years, "Area (ha)"
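The same four-line `Counter` probe is pasted after each feature-engineering step (tagged `<0>`, `<111>`, `<1>` through `<4>`, and `5` below) to trace where duplicate `AUC_ESI4WK` column labels creep in. A sketch of how such a probe can be written once as a helper; the function name is ours, not the package's:

```python
from collections import Counter
import pandas as pd

def report_duplicate_columns(df: pd.DataFrame, like: str, tag: str) -> None:
    """Print column labels containing `like` that occur more than once."""
    cols = [c for c in df.columns if like in str(c)]
    dupes = {k: v for k, v in Counter(cols).items() if v > 1}
    print(tag, dupes)

# Hypothetical frame with a duplicated column label
df = pd.DataFrame([[1, 2, 3]], columns=["AUC_ESI4WK_a", "AUC_ESI4WK_a", "x"])
report_duplicate_columns(df, "AUC_ESI4WK", "<0>")  # <0> {'AUC_ESI4WK_a': 2}
```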
@@ -1053,7 +1073,10 @@ class Geocif:
         df = fe.compute_analogous_yield(
             df, self.all_seasons_with_yield, self.number_median_years, self.target
         )
-
+        from collections import Counter
+        esi_cols = df.filter(like="AUC_ESI4WK").columns.tolist()
+        dupes = {k: v for k, v in Counter(esi_cols).items() if v > 1}
+        print("5", dupes)
         # Create Region_ID column based on Region column category code
         df["Region"] = df["Region"].astype("category")
         if self.cluster_strategy == "single":
@@ -1067,7 +1090,7 @@ class Geocif:

         # Region_ID should be type category
         df["Region_ID"] = df["Region_ID"].astype("category")
-
+        breakpoint()
         return df

     def execute(self):
@@ -1474,7 +1497,8 @@ class Geocif:
         assert all_files, f"No files found in {_dir_country} with {file_name}"

         self.df_inputs = pd.concat(
-            (pd.read_csv(f) for f in all_files
+            (pd.read_csv(f, engine="pyarrow") for f in tqdm(all_files, desc="Reading CSVs", leave=False)),
+            ignore_index=True
         )

         self.df_inputs = stats.add_statistics(
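The rewritten `pd.concat` call switches `pd.read_csv` to the multithreaded Arrow parser (`engine="pyarrow"`, available in pandas 1.4+ when the `pyarrow` package is installed) and wraps the file list in `tqdm` for a progress bar. A self-contained sketch of the pattern:

```python
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Create two tiny CSVs so the example runs anywhere
for name in ("a.csv", "b.csv"):
    Path(name).write_text("x,y\n1,2\n")

all_files = ["a.csv", "b.csv"]
df = pd.concat(
    (pd.read_csv(f, engine="pyarrow") for f in tqdm(all_files, desc="Reading CSVs", leave=False)),
    ignore_index=True,  # renumber rows 0..n-1 instead of repeating each file's index
)
print(len(df))  # 2 (one data row per file)
```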
@@ -1486,7 +1510,9 @@ class Geocif:
             [self.target] + self.statistics_columns,
             self.method,
         )
+
         """ Add information on starting and ending time period for each stage"""
+        self.logger.info("Adding starting and ending time period for each stage")
         self.df_inputs = stages.add_stage_information(self.df_inputs, self.method)

         self.df_inputs.to_csv(file, index=False)
geocif/indices_runner.py

@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "
+            if "ukraine" in i[3]
             # or "lesotho_maize" in i[3] or
             # # "namibia_" in i[2] or
             # "united_republic_of_tanzania_maize" in i[3]
@@ -179,7 +179,7 @@ class cei_runner(base.BaseGeo):
         ]
         # "malawi" in i[2]]

-        if
+        if False and self.do_parallel:
             num_cpu = int(cpu_count() * 0.6)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
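Note that `if False and self.do_parallel:` short-circuits to `False`, so the parallel branch is effectively disabled in this release. The branch itself uses the standard `multiprocessing` fan-out; a runnable sketch with a stand-in worker:

```python
from multiprocessing import Pool, cpu_count

def process(x):
    return x * x  # stand-in for indices.process

if __name__ == "__main__":
    combinations = [1, 2, 3, 4]
    num_cpu = max(1, int(cpu_count() * 0.6))
    with Pool(num_cpu) as p:
        # imap_unordered yields results as workers finish, in arbitrary order
        for result in p.imap_unordered(process, combinations):
            print(result)
```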
geocif/ml/embedding.py

@@ -32,7 +32,10 @@ def _compute_correlations(X, y):
         f_series = X[feature]

         # Ignore NaN values in either y or f_series
-
+        try:
+            mask = ~(np.isnan(y) | np.isnan(f_series))
+        except:
+            breakpoint()
         y_filtered = y[mask]
         f_series_filtered = f_series[mask]
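`np.isnan` only accepts numeric dtypes; on an object-dtype column it raises a TypeError, which is presumably the failure mode the new `try`/`except` guards (here by stopping in `pdb` via `breakpoint()`). A sketch of the behavior, with pandas' `isna` as the dtype-agnostic alternative:

```python
import numpy as np
import pandas as pd

y = pd.Series([1.0, np.nan, 3.0])
f_object = pd.Series([1.0, None, "3"], dtype=object)

mask_ok = ~np.isnan(y)           # fine on float dtype
try:
    np.isnan(f_object)           # TypeError: isnan not supported for object dtype
except TypeError:
    mask_obj = ~f_object.isna()  # .isna() / pd.isna handles any dtype
print(mask_ok.tolist(), mask_obj.tolist())
```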
geocif/ml/stages.py

@@ -268,6 +268,11 @@ def update_feature_names(df, method):
         # Saving the result in the dictionary
         stages_info[element] = (cei, start_stage, end_stage, new_column_name)

+    # Check if any duplicates exist in the dictionary
+    if len(stages_info) != len(set(stages_info.values())):
+        breakpoint()
+        raise ValueError(f"Duplicate stage information found for {element}")
+    breakpoint()
     # For each column in df, check if it exists in stages_info, and
     # replace it with the new column name
     # Precompute the rename mapping outside the loop
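The new guard compares the number of dictionary entries against the number of distinct values: since keys are unique, any mismatch means two stage keys produced the same renamed column. A minimal illustration with hypothetical values:

```python
stages_info = {"a": (1, 2), "b": (1, 2), "c": (3, 4)}

# Keys are unique, so fewer distinct values than entries means duplicate values
has_duplicates = len(stages_info) != len(set(stages_info.values()))
print(has_duplicates)  # True: "a" and "b" map to the same tuple
```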