PyPI - chronos-forecasting - Versions diffs - 2.2.0__tar.gz → 2.2.0rc2__tar.gz - Mend

chronos-forecasting 2.2.0 (tar.gz) → 2.2.0rc2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/.gitignore RENAMED Viewed

@@ -160,6 +160,4 @@ cython_debug/
 #.idea/
 # macOS stuff
-.DS_store
-chronos-2-finetuned
+.DS_store

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chronos-forecasting
-Version: 2.2.0
+Version: 2.2.0rc2
 Summary: Chronos: Pretrained models for time series forecasting
 Project-URL: Homepage, https://github.com/amazon-science/chronos-forecasting
 Project-URL: Issues, https://github.com/amazon-science/chronos-forecasting/issues

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/scripts/evaluation/evaluate.py RENAMED Viewed

@@ -295,7 +295,7 @@ def chronos_2(
     device: str = "cuda",
     torch_dtype: str = "float32",
     batch_size: int = 32,
-    cross_learning: bool = False,
+    predict_batches_jointly: bool = False,
 ):
     """Evaluate Chronos-2 models.
@@ -316,7 +316,7 @@ def chronos_2(
     batch_size : int, optional, default = 32
         Batch size for inference. For Chronos-Bolt models, significantly larger
         batch sizes can be used
-    cross_learning: bool, optional, default = False
+    predict_batches_jointly: bool, optional, default = False
         If True, cross-learning is enables and model makes joint predictions for all
         items in the batch
     """
@@ -335,7 +335,7 @@ def chronos_2(
         metrics_path=metrics_path,
         model_id=model_id,
         batch_size=batch_size,
-        cross_learning=cross_learning,
+        predict_batches_jointly=predict_batches_jointly,
     )

chronos_forecasting-2.2.0rc2/src/chronos/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "2.2.0rc2"

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/src/chronos/base.py RENAMED Viewed

@@ -141,7 +141,6 @@ class BaseChronosPipeline(metaclass=PipelineRegistry):
         target: str = "target",
         prediction_length: int | None = None,
         quantile_levels: list[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
-        validate_inputs: bool = True,
         **predict_kwargs,
     ) -> "pd.DataFrame":
         """
@@ -163,9 +162,6 @@ class BaseChronosPipeline(metaclass=PipelineRegistry):
             Number of steps to predict for each time series
         quantile_levels
             Quantile levels to compute
-        validate_inputs
-            When True, the dataframe(s) will be validated before prediction, ensuring that timestamps have a
-            regular frequency, and item IDs match between past and future data. Setting to False disables these checks.
         **predict_kwargs
             Additional arguments passed to predict_quantiles
@@ -200,7 +196,6 @@ class BaseChronosPipeline(metaclass=PipelineRegistry):
             timestamp_column=timestamp_column,
             target_columns=[target],
             prediction_length=prediction_length,
-            validate_inputs=validate_inputs,
         )
         # NOTE: any covariates, if present, are ignored here

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/src/chronos/chronos2/pipeline.py RENAMED Viewed

@@ -19,6 +19,7 @@ from transformers import AutoConfig
 from transformers.utils.import_utils import is_peft_available
 from transformers.utils.peft_utils import find_adapter_config_file
 import chronos.chronos2
 from chronos.base import BaseChronosPipeline, ForecastType
 from chronos.chronos2 import Chronos2Model
@@ -113,7 +114,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
         min_past: int | None = None,
         finetuned_ckpt_name: str = "finetuned-ckpt",
         callbacks: list["TrainerCallback"] | None = None,
-        remove_printer_callback: bool = False,
         **extra_trainer_kwargs,
     ) -> "Chronos2Pipeline":
         """
@@ -156,8 +156,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
             The name of the directory inside `output_dir` in which the final fine-tuned checkpoint will be saved, by default "finetuned-ckpt"
         callbacks
             A list of `TrainerCallback`s which will be forwarded to the HuggingFace `Trainer`
-        remove_printer_callback
-            If True, all instances of `PrinterCallback` are removed from callbacks
         **extra_trainer_kwargs
             Extra kwargs are directly forwarded to `TrainingArguments`
@@ -167,7 +165,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
         """
         import torch.cuda
-        from transformers.trainer_callback import PrinterCallback
         from transformers.training_args import TrainingArguments
         if finetune_mode == "lora":
@@ -178,7 +175,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
                     "`peft` is required for `finetune_mode='lora'`. Please install it with `pip install peft`. Falling back to `finetune_mode='full'`."
                 )
                 finetune_mode = "full"
-                lora_config = None
         from chronos.chronos2.trainer import Chronos2Trainer, EvaluateAndSaveFinalStepCallback
@@ -269,7 +265,7 @@ class Chronos2Pipeline(BaseChronosPipeline):
             report_to="none",
             max_steps=num_steps,
             gradient_accumulation_steps=1,
-            dataloader_num_workers=0,
+            dataloader_num_workers=1,
             tf32=has_sm80 and not use_cpu,
             bf16=has_sm80 and not use_cpu,
             save_only_model=True,
@@ -326,19 +322,12 @@ class Chronos2Pipeline(BaseChronosPipeline):
             eval_dataset=eval_dataset,
             callbacks=callbacks,
         )
-        if remove_printer_callback:
-            trainer.pop_callback(PrinterCallback)
         trainer.train()
-        # update context_length and max_output_patches, if the model was fine-tuned with larger values
-        model.chronos_config.context_length = max(model.chronos_config.context_length, context_length)
+        # update max_output_patches, if the model was fine-tuned with longer prediction_length
         model.chronos_config.max_output_patches = max(
             model.chronos_config.max_output_patches, math.ceil(prediction_length / self.model_output_patch_size)
         )
-        # update chronos_config in model's config, so it is saved correctly
-        model.config.chronos_config = model.chronos_config.__dict__
         # Create a new pipeline with the fine-tuned model
         finetuned_pipeline = Chronos2Pipeline(model=model)
@@ -447,7 +436,7 @@ class Chronos2Pipeline(BaseChronosPipeline):
         prediction_length: int | None = None,
         batch_size: int = 256,
         context_length: int | None = None,
-        cross_learning: bool = False,
+        predict_batches_jointly: bool = False,
         limit_prediction_length: bool = False,
         **kwargs,
     ) -> list[torch.Tensor]:
@@ -533,7 +522,7 @@ class Chronos2Pipeline(BaseChronosPipeline):
             will be lower than this value, by default 256
         context_length
             The maximum context length used during for inference, by default set to the model's default context length
-        cross_learning
+        predict_batches_jointly
             If True, cross-learning is enabled, i.e., all the tasks in `inputs` will be predicted jointly and the model will share information across all inputs, by default False
             The following must be noted when using cross-learning:
             - Cross-learning doesn't always improve forecast accuracy and must be tested for individual use cases.
@@ -553,14 +542,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
         if prediction_length is None:
             prediction_length = model_prediction_length
-        if kwargs.get("predict_batches_jointly") is not None:
-            warnings.warn(
-                "The `predict_batches_jointly` argument is deprecated and will be removed in a future version. "
-                "Please use `cross_learning=True` to enable the cross-learning mode.",
-                category=FutureWarning,
-                stacklevel=2,
-            )
-            cross_learning = kwargs.pop("predict_batches_jointly")
         # The maximum number of output patches to generate in a single forward pass before the long-horizon heuristic kicks in. Note: A value larger
         # than the model's default max_output_patches may lead to degradation in forecast accuracy, defaults to a model-specific value
         max_output_patches = kwargs.pop("max_output_patches", self.max_output_patches)
@@ -618,7 +599,7 @@ class Chronos2Pipeline(BaseChronosPipeline):
             batch_future_covariates = batch["future_covariates"]
             batch_target_idx_ranges = batch["target_idx_ranges"]
-            if cross_learning:
+            if predict_batches_jointly:
                 batch_group_ids = torch.zeros_like(batch_group_ids)
             batch_prediction = self._predict_batch(
@@ -809,8 +790,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
         prediction_length: int | None = None,
         quantile_levels: list[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
         batch_size: int = 256,
-        context_length: int | None = None,
-        cross_learning: bool = False,
         validate_inputs: bool = True,
         **predict_kwargs,
     ) -> "pd.DataFrame":
@@ -841,18 +820,8 @@ class Chronos2Pipeline(BaseChronosPipeline):
             The batch size used for prediction. Note that the batch size here means the number of time series, including target(s) and covariates,
             which are input into the model. If your data has multiple target and/or covariates, the effective number of time series tasks in a batch
             will be lower than this value, by default 256
-        context_length
-            The maximum context length used during for inference, by default set to the model's default context length
-        cross_learning
-            If True, cross-learning is enabled, i.e., all the tasks in `inputs` will be predicted jointly and the model will share information across all inputs, by default False
-            The following must be noted when using cross-learning:
-            - Cross-learning doesn't always improve forecast accuracy and must be tested for individual use cases.
-            - Results become dependent on batch size. Very large batch sizes may not provide benefits as they deviate from the maximum group size used during pretraining.
-            For optimal results, consider using a batch size around 100 (as used in the Chronos-2 technical report).
-            - Cross-learning is most helpful when individual time series have limited historical context, as the model can leverage patterns from related series in the batch.
         validate_inputs
-            When True, the dataframe(s) will be validated before prediction, ensuring that timestamps have a
-            regular frequency, and item IDs match between past and future data. Setting to False disables these checks.
+            When True, the dataframe(s) will be validated before prediction
         **predict_kwargs
             Additional arguments passed to predict_quantiles
@@ -893,8 +862,6 @@ class Chronos2Pipeline(BaseChronosPipeline):
             quantile_levels=quantile_levels,
             limit_prediction_length=False,
             batch_size=batch_size,
-            context_length=context_length,
-            cross_learning=cross_learning,
             **predict_kwargs,
         )
         # since predict_df tasks are homogenous by input design, we can safely stack the list of tensors into a single tensor
@@ -1056,7 +1023,11 @@ class Chronos2Pipeline(BaseChronosPipeline):
             finetune_kwargs["prediction_length"] = first_window.horizon
             finetune_kwargs["batch_size"] = finetune_kwargs.get("batch_size", batch_size)
-            pipeline = self.fit(inputs=inputs, **finetune_kwargs)
+            try:
+                pipeline = self.fit(inputs=inputs, **finetune_kwargs)
+            except Exception as e:
+                msg = f"Finetuning failed with error: {e}. Continuing with the pretrained model."
+                warnings.warn(msg, category=UserWarning, stacklevel=2)
         predictions_per_window = []
         inference_time_s = 0.0

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/src/chronos/df_utils.py RENAMED Viewed

@@ -185,13 +185,25 @@ def validate_df_inputs(
         if context_ids != future_ids:
             raise ValueError("future_df must contain the same time series IDs as df")
-        future_series_lengths = future_df[id_column].value_counts(sort=False)
-        if (future_series_lengths != prediction_length).any():
-            invalid_series = future_series_lengths[future_series_lengths != prediction_length]
-            raise ValueError(
-                f"future_df must contain {prediction_length=} values for each series, "
-                f"but found series with different lengths: {invalid_series.to_dict()}"
-            )
+        future_series_lengths = future_df[id_column].value_counts(sort=False).to_list()
+        # Validate future series lengths match prediction_length
+        future_start_idx = 0
+        future_timestamps_index = pd.DatetimeIndex(future_df[timestamp_column])
+        for future_length in future_series_lengths:
+            future_timestamps = future_timestamps_index[future_start_idx : future_start_idx + future_length]
+            future_series_id = future_df[id_column].iloc[future_start_idx]
+            if future_length != prediction_length:
+                raise ValueError(
+                    f"Future covariates all time series must have length {prediction_length}, got {future_length} for series {future_series_id}"
+                )
+            if future_length < 3 or inferred_freq != validate_freq(future_timestamps, future_series_id):
+                raise ValueError(
+                    f"Future covariates must have the same frequency as context, found series {future_series_id} with a different frequency"
+                )
+            future_start_idx += future_length
+        assert len(series_lengths) == len(future_series_lengths)
     return df, future_df, inferred_freq, series_lengths, original_order
@@ -291,16 +303,10 @@ def convert_df_input_to_list_of_dicts_input(
     past_covariates_dict = {
         col: df[col].to_numpy() for col in df.columns if col not in [id_column, timestamp_column] + target_columns
     }
-    future_covariates_dict = {}
     if future_df is not None:
-        for col in future_df.columns.drop([id_column, timestamp_column]):
-            future_covariates_dict[col] = future_df[col].to_numpy()
-        if validate_inputs:
-            if (pd.DatetimeIndex(future_df[timestamp_column]) != pd.DatetimeIndex(prediction_timestamps_array)).any():
-                raise ValueError(
-                    "future_df timestamps do not match the expected prediction timestamps. "
-                    "You can disable this check by setting `validate_inputs=False`"
-                )
+        future_covariates_dict = {
+            col: future_df[col].to_numpy() for col in future_df.columns if col not in [id_column, timestamp_column]
+        }
     for i in range(len(series_lengths)):
         start_idx, end_idx = indptr[i], indptr[i + 1]
@@ -310,12 +316,23 @@ def convert_df_input_to_list_of_dicts_input(
         prediction_timestamps[series_id] = prediction_timestamps_array[future_start_idx:future_end_idx]
         task: dict[str, np.ndarray | dict[str, np.ndarray]] = {"target": target_array[:, start_idx:end_idx]}
+        # Handle covariates if present
         if len(past_covariates_dict) > 0:
             task["past_covariates"] = {col: values[start_idx:end_idx] for col, values in past_covariates_dict.items()}
-            if len(future_covariates_dict) > 0:
-                task["future_covariates"] = {
-                    col: values[future_start_idx:future_end_idx] for col, values in future_covariates_dict.items()
-                }
+            # Handle future covariates
+            if future_df is not None:
+                first_future_timestamp = future_df[timestamp_column].iloc[future_start_idx]
+                assert first_future_timestamp == prediction_timestamps[series_id][0], (
+                    f"the first timestamp in future_df must be the first forecast timestamp, found mismatch "
+                    f"({first_future_timestamp} != {prediction_timestamps[series_id][0]}) in series {series_id}"
+                )
+                if len(future_covariates_dict) > 0:
+                    task["future_covariates"] = {
+                        col: values[future_start_idx:future_end_idx] for col, values in future_covariates_dict.items()
+                    }
         inputs.append(task)
     assert len(inputs) == len(series_lengths)

{chronos_forecasting-2.2.0 → chronos_forecasting-2.2.0rc2}/test/test_chronos2.py RENAMED Viewed

@@ -421,39 +421,43 @@ def test_pipeline_can_evaluate_on_dummy_fev_task(pipeline, task_kwargs):
 @pytest.mark.parametrize(
-    "context_setup, future_setup",
+    "context_setup, future_setup, expected_rows",
     [
         # Targets only
-        ({}, None),
+        ({}, None, 6),  # 2 series * 3 predictions
         # Multiple targets with different context lengths
-        ({"target_cols": ["sales", "revenue", "profit"], "n_points": [10, 17]}, None),
+        (
+            {"target_cols": ["sales", "revenue", "profit"], "n_points": [10, 17]},
+            None,
+            18,
+        ),  # 2 series * 3 targets * 3 predictions
         # With past covariates
-        ({"covariates": ["cov1"]}, None),
+        ({"covariates": ["cov1"]}, None, 6),
         # With future covariates
-        ({"covariates": ["cov1"]}, {"covariates": ["cov1"]}),
+        ({"covariates": ["cov1"]}, {"covariates": ["cov1"], "n_points": [3, 3]}, 6),
         # With past-only and future covariates
-        ({"covariates": ["cov1", "cov2"]}, {"covariates": ["cov1"]}),
+        ({"covariates": ["cov1", "cov2"]}, {"covariates": ["cov1"], "n_points": [3, 3]}, 6),
         # With past-only and future covariates and different series order
         (
             {"series_ids": ["B", "C", "A", "Z"], "n_points": [10, 20, 100, 256], "covariates": ["cov1", "cov2"]},
-            {"series_ids": ["B", "C", "A", "Z"], "covariates": ["cov1"]},
+            {
+                "series_ids": ["B", "C", "A", "Z"],
+                "covariates": ["cov1"],
+                "n_points": [3, 3, 3, 3],
+            },
+            12,
         ),
     ],
 )
 @pytest.mark.parametrize("freq", ["s", "min", "30min", "h", "D", "W", "ME", "QE", "YE"])
-@pytest.mark.parametrize("prediction_length", [1, 4])
 @pytest.mark.parametrize("validate_inputs", [True, False])
 def test_predict_df_works_for_valid_inputs(
-    pipeline, context_setup, future_setup, freq, validate_inputs, prediction_length
+    pipeline, context_setup, future_setup, expected_rows, freq, validate_inputs
 ):
+    prediction_length = 3
     df = create_df(**context_setup, freq=freq)
     forecast_start_times = get_forecast_start_times(df, freq)
-    if future_setup:
-        series_ids = future_setup.get("series_ids", ["A", "B"])
-        future_setup_with_n_points = {**future_setup, "n_points": [prediction_length] * len(series_ids)}
-        future_df = create_future_df(forecast_start_times, **future_setup_with_n_points, freq=freq)
-    else:
-        future_df = None
+    future_df = create_future_df(forecast_start_times, **future_setup, freq=freq) if future_setup else None
     series_ids = context_setup.get("series_ids", ["A", "B"])
     target_columns = context_setup.get("target_cols", ["target"])
@@ -467,7 +471,6 @@ def test_predict_df_works_for_valid_inputs(
         validate_inputs=validate_inputs,
     )
-    expected_rows = n_series * n_targets * prediction_length
     assert len(result) == expected_rows
     assert "item_id" in result.columns and np.all(
         result["item_id"].to_numpy() == np.array(series_ids).repeat(n_targets * prediction_length)
@@ -577,78 +580,24 @@ def test_predict_df_with_future_df_missing_series_raises_error(pipeline):
         pipeline.predict_df(df, future_df=future_df)
-def test_predict_df_with_future_df_with_different_freq_raises_error(pipeline):
-    df = create_df(series_ids=["A", "B"], covariates=["cov1"], freq="h")
-    future_df = create_future_df(
-        get_forecast_start_times(df), series_ids=["A", "B"], n_points=[3, 3], covariates=["cov1"], freq="D"
-    )
-    with pytest.raises(ValueError, match="future_df timestamps do not match"):
-        pipeline.predict_df(df, future_df=future_df, prediction_length=3)
 def test_predict_df_with_future_df_with_different_lengths_raises_error(pipeline):
     df = create_df(series_ids=["A", "B"], covariates=["cov1"])
     future_df = create_future_df(
         get_forecast_start_times(df), series_ids=["A", "B"], n_points=[3, 7], covariates=["cov1"]
     )
-    with pytest.raises(ValueError, match="future_df must contain prediction"):
+    with pytest.raises(ValueError, match="all time series must have length"):
         pipeline.predict_df(df, future_df=future_df, prediction_length=3)
-@pytest.mark.parametrize(
-    "context_setup, future_setup",
-    [
-        # Targets only
-        ({}, None),
-        # Multiple targets with different context lengths
-        ({"target_cols": ["sales", "revenue", "profit"], "n_points": [10, 17]}, None),
-        # With past covariates
-        ({"covariates": ["cov1"]}, None),
-        # With future covariates
-        ({"covariates": ["cov1"]}, {"covariates": ["cov1"]}),
-        # With past-only and future covariates
-        ({"covariates": ["cov1", "cov2"]}, {"covariates": ["cov1"]}),
-        # With past-only and future covariates and different series order
-        (
-            {"series_ids": ["B", "C", "A", "Z"], "n_points": [10, 20, 100, 256], "covariates": ["cov1", "cov2"]},
-            {"series_ids": ["B", "C", "A", "Z"], "covariates": ["cov1"]},
-        ),
-    ],
-)
-@pytest.mark.parametrize("prediction_length", [1, 4])
-def test_predict_df_outputs_different_results_with_cross_learning_enabled(
-    pipeline, context_setup, future_setup, prediction_length
-):
-    freq = "h"
-    df = create_df(**context_setup, freq=freq)
-    forecast_start_times = get_forecast_start_times(df, freq)
-    if future_setup:
-        series_ids = future_setup.get("series_ids", ["A", "B"])
-        future_setup_with_n_points = {**future_setup, "n_points": [prediction_length] * len(series_ids)}
-        future_df = create_future_df(forecast_start_times, **future_setup_with_n_points, freq=freq)
-    else:
-        future_df = None
-    series_ids = context_setup.get("series_ids", ["A", "B"])
-    target_columns = context_setup.get("target_cols", ["target"])
-    result_with_cross_learning = pipeline.predict_df(
-        df,
-        future_df=future_df,
-        target=target_columns,
-        prediction_length=prediction_length,
-        cross_learning=True,
-    )
-    result_without_cross_learning = pipeline.predict_df(
-        df,
-        future_df=future_df,
-        target=target_columns,
-        prediction_length=prediction_length,
-        cross_learning=False,
+def test_predict_df_with_future_df_with_different_freq_raises_error(pipeline):
+    df = create_df(series_ids=["A", "B"], covariates=["cov1"], freq="h")
+    future_df = create_future_df(
+        get_forecast_start_times(df), series_ids=["A", "B"], n_points=[3, 3], covariates=["cov1"], freq="D"
     )
-    assert not np.array_equal(result_with_cross_learning["predictions"], result_without_cross_learning["predictions"])
+    with pytest.raises(ValueError, match="must have the same frequency as context"):
+        pipeline.predict_df(df, future_df=future_df, prediction_length=3)
 @pytest.mark.parametrize(
@@ -925,36 +874,40 @@ def test_when_input_time_series_are_too_short_then_finetuning_raises_error(pipel
 @pytest.mark.parametrize(
-    "context_setup, future_setup",
+    "context_setup, future_setup, expected_rows",
     [
         # Targets only
-        ({}, None),
+        ({}, None, 6),  # 2 series * 3 predictions
         # Multiple targets with different context lengths
-        ({"target_cols": ["sales", "revenue", "profit"], "n_points": [10, 17]}, None),
+        (
+            {"target_cols": ["sales", "revenue", "profit"], "n_points": [10, 17]},
+            None,
+            18,
+        ),  # 2 series * 3 targets * 3 predictions
         # With past covariates
-        ({"covariates": ["cov1"]}, None),
+        ({"covariates": ["cov1"]}, None, 6),
         # With future covariates
-        ({"covariates": ["cov1"]}, {"covariates": ["cov1"]}),
+        ({"covariates": ["cov1"]}, {"covariates": ["cov1"], "n_points": [3, 3]}, 6),
         # With past-only and future covariates
-        ({"covariates": ["cov1", "cov2"]}, {"covariates": ["cov1"]}),
+        ({"covariates": ["cov1", "cov2"]}, {"covariates": ["cov1"], "n_points": [3, 3]}, 6),
         # With past-only and future covariates and different series order
         (
             {"series_ids": ["B", "C", "A", "Z"], "n_points": [10, 20, 100, 256], "covariates": ["cov1", "cov2"]},
-            {"series_ids": ["B", "C", "A", "Z"], "covariates": ["cov1"]},
+            {
+                "series_ids": ["B", "C", "A", "Z"],
+                "covariates": ["cov1"],
+                "n_points": [3, 3, 3, 3],
+            },
+            12,
         ),
     ],
 )
 @pytest.mark.parametrize("freq", ["h", "D", "ME"])
-def test_two_step_finetuning_with_df_input_works(pipeline, context_setup, future_setup, freq):
+def test_two_step_finetuning_with_df_input_works(pipeline, context_setup, future_setup, expected_rows, freq):
     prediction_length = 3
     df = create_df(**context_setup, freq=freq)
     forecast_start_times = get_forecast_start_times(df, freq)
-    if future_setup:
-        series_ids = future_setup.get("series_ids", ["A", "B"])
-        future_setup_with_n_points = {**future_setup, "n_points": [prediction_length] * len(series_ids)}
-        future_df = create_future_df(forecast_start_times, **future_setup_with_n_points, freq=freq)
-    else:
-        future_df = None
+    future_df = create_future_df(forecast_start_times, **future_setup, freq=freq) if future_setup else None
     series_ids = context_setup.get("series_ids", ["A", "B"])
     target_columns = context_setup.get("target_cols", ["target"])
@@ -987,7 +940,6 @@ def test_two_step_finetuning_with_df_input_works(pipeline, context_setup, future
     )
     # Check predictions from the fine-tuned model are valid
-    expected_rows = n_series * n_targets * prediction_length
     assert len(result) == expected_rows
     assert "item_id" in result.columns and np.all(
         result["item_id"].to_numpy() == np.array(series_ids).repeat(n_targets * prediction_length)