PyEvoMotion 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PKG-INFO +72 -4
  2. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/cli.py +1 -8
  3. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/core/base.py +83 -20
  4. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/core/core.py +82 -38
  5. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/core/parser.py +4 -1
  6. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/README.md +72 -3
  7. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/pyproject.toml +1 -1
  8. pyevomotion-0.1.1/share/anomalous_diffusion.pdf +0 -0
  9. pyevomotion-0.1.1/share/figUK.tsv +9949 -0
  10. pyevomotion-0.1.1/share/figUK_plots.pdf +0 -0
  11. pyevomotion-0.1.1/share/figUK_regression_results.json +18 -0
  12. pyevomotion-0.1.1/share/figUK_run_args.json +13 -0
  13. pyevomotion-0.1.1/share/figUK_stats.tsv +41 -0
  14. pyevomotion-0.1.1/share/figUSA.tsv +9470 -0
  15. pyevomotion-0.1.1/share/figUSA_plots.pdf +0 -0
  16. pyevomotion-0.1.1/share/figUSA_regression_results.json +18 -0
  17. pyevomotion-0.1.1/share/figUSA_run_args.json +13 -0
  18. pyevomotion-0.1.1/share/figUSA_stats.tsv +34 -0
  19. pyevomotion-0.1.1/share/figdataUK.tsv +10001 -0
  20. pyevomotion-0.1.1/share/figdataUSA.tsv +10001 -0
  21. pyevomotion-0.1.1/share/figure.pdf +0 -0
  22. pyevomotion-0.1.1/share/generate_sequences_from_synthdata.py +85 -0
  23. pyevomotion-0.1.1/share/manuscript_figure.py +761 -0
  24. pyevomotion-0.1.1/share/synth_figure.pdf +0 -0
  25. pyevomotion-0.1.1/share/uk_time_windows.pdf +0 -0
  26. pyevomotion-0.1.1/share/weekly_size.pdf +0 -0
  27. pyevomotion-0.1.1/tests/data/test1/output/test1.data.tsv +101 -0
  28. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/test_UK_USA_dataset.py +43 -10
  29. pyevomotion-0.1.1/tests/test_synthetic_datasets.py +55 -0
  30. pyevomotion-0.1.0/share/manuscript_figure.py +0 -316
  31. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/__init__.py +0 -0
  32. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/core/__init__.py +0 -0
  33. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/PyEvoMotion/utils.py +0 -0
  34. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/share/mafft_install.sh +0 -0
  35. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/__init__.py +0 -0
  36. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/data/test1/out_run_args.json +0 -0
  37. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/data/test1/test1.metadata.tsv +0 -0
  38. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/data/test1/test1.sequences.fasta +0 -0
  39. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/data/test2/out_run_args.json +0 -0
  40. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/data/test2/test2.metadata.parquet.gz +0 -0
  41. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/data/test3/ids_sampled_for_figure.json +0 -0
  42. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/helpers/test_UK_USA_dataset_helpers.py +0 -0
  43. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/helpers/test_parser_helpers.py +0 -0
  44. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/test_core.py +0 -0
  45. {pyevomotion-0.1.0 → pyevomotion-0.1.1}/tests/test_parser.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PyEvoMotion
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Evolutionary motion analysis tool
5
5
  Keywords: evolution,anomalous diffusion,bioinformatics
6
6
  Author: Lucas Goiriz
@@ -27,7 +27,7 @@ _(See [Goiriz L, et al.](http://doi.org/10.1073/pnas.2303578120))_
27
27
  ## Installation
28
28
 
29
29
  > **Note:**
30
- > `PyEvoMotion` uses [mafft](https://mafft.cbrc.jp/alignment/software/) to do the sequence alignment. If its not available in your system, on the the first run of `PyEvoMotion`, it will ask to install it locally.
30
+ > `PyEvoMotion` uses [mafft](https://mafft.cbrc.jp/alignment/software/) to do the sequence alignment. If it's not available in your system, on the first run of `PyEvoMotion`, it will ask to install it locally.
31
31
  >
32
32
  > If so, ensure to restart your shell session or run `source ~/.bashrc` to update the PATH environment variable, so that the `mafft` executable is available in your shell.
33
33
  >
@@ -74,8 +74,6 @@ options:
74
74
  -ep, --export_plots Export the plots of the analysis.
75
75
  -l LENGTH_FILTER, --length_filter LENGTH_FILTER
76
76
  Length filter for the sequences (removes sequences with length less than the specified value). Default is 0.
77
- -n N_THRESHOLD, --n_threshold N_THRESHOLD
78
- Minimum number of sequences required in a time interval to compute statistics. Default is 2.
79
77
  -xj, --export_json Export the run arguments to a json file.
80
78
  -ij IMPORT_JSON, --import_json IMPORT_JSON
81
79
  Import the run arguments from a JSON file. If this argument is passed, the other arguments are ignored. The JSON file must contain the mandatory keys 'seqs', 'meta', and 'out'.
@@ -114,4 +112,74 @@ pytest
114
112
  > Given the size of the test data, this may take a while.
115
113
 
116
114
 
115
+ ## Docker
117
116
 
117
+ A Docker image containing a virtual environment with `PyEvoMotion` pre-installed, along with its dependencies and the test data, is available at `ghcr.io/luksgrin/pyevomotion:latest`, and the manuscript's original figure script is available at `ghcr.io/luksgrin/pyevomotion-fig:latest`.
118
+
119
+ Pull the image by running:
120
+
121
+ ```bash
122
+ docker pull ghcr.io/luksgrin/pyevomotion:latest
123
+ ```
124
+
125
+ Alternatively, to build the main image, run:
126
+
127
+ ```bash
128
+ docker build -t ghcr.io/luksgrin/pyevomotion:latest -f docker/Dockerfile .
129
+ ```
130
+
131
+ ### Running the container
132
+
133
+ To start an interactive container:
134
+
135
+ ```bash
136
+ docker run -it ghcr.io/luksgrin/pyevomotion:latest
137
+ ```
138
+
139
+ This will open a prompt that displays a welcome message and allows you to start using `PyEvoMotion` right away.
140
+
141
+ ### Included data
142
+
143
+ The image includes (heavy) input files (FASTA and metadata) in:
144
+
145
+ ```bash
146
+ /home/pyevomotion/pyevomotion-*/tests/data/test3
147
+ ```
148
+
149
+ which are used by the test suite (they are automatically downloaded and extracted if not present, so using the containerized version is more convenient).
150
+
151
+ Also, the source script for figure generation (along with the pre-generated results of running `PyEvoMotion`) is already available under:
152
+
153
+ ```bash
154
+ /home/pyevomotion/pyevomotion-*/share
155
+ ```
156
+
157
+ Do note that if all the contents within
158
+
159
+ ```bash
160
+ /home/pyevomotion/pyevomotion-*/share
161
+ ```
162
+
163
+ are deleted except for the `manuscript_figure.py` script, it is still possible to generate the figure (although it will take much longer since the dataset's stats must be computed by `PyEvoMotion`).
164
+
165
+ ### Running tests
166
+
167
+ Once inside the container, run:
168
+
169
+ ```bash
170
+ cd pyevomotion-*
171
+ pytest
172
+ ```
173
+
174
+ This will execute the test suite included with the source.
175
+
176
+ ### Reproducing the Figure from the original manuscript
177
+
178
+ To reproduce the figure from the original manuscript, run:
179
+
180
+ ```bash
181
+ cd pyevomotion-*
182
+ python share/manuscript_figure.py export
183
+ ```
184
+
185
+ The figure will be saved in the `share` directory. Font warnings may appear — they are safe to ignore and do not affect the scientific content of the figure, only the styling.
@@ -255,13 +255,6 @@ def _parse_arguments() -> argparse.Namespace:
255
255
  default=0,
256
256
  help="Length filter for the sequences (removes sequences with length less than the specified value). Default is 0."
257
257
  )
258
- parser.add_argument(
259
- "-n",
260
- "--n_threshold",
261
- type=int,
262
- default=2,
263
- help="Minimum number of sequences required in a time interval to compute statistics. Default is 2."
264
- )
265
258
  parser.add_argument(
266
259
  "-xj",
267
260
  "--export_json",
@@ -407,7 +400,6 @@ def _main():
407
400
  # Runs the analysis
408
401
  stats, reg = instance.analysis(
409
402
  length=args.length_filter,
410
- n_threshold=args.n_threshold,
411
403
  show=args.show,
412
404
  mutation_kind=args.kind,
413
405
  export_plots_filename=(
@@ -432,6 +424,7 @@ def _main():
432
424
  # Exports the regression models to a JSON file
433
425
  with open(f"{args.out}_regression_results.json", "w") as file:
434
426
  json.dump(_reg, file, indent=4)
427
+ print(f"Regression results saved to {args.out}_regression_results.json")
435
428
 
436
429
  # Exits the program with code 0 (success)
437
430
  exit(0)
@@ -102,7 +102,7 @@ class PyEvoMotionBase():
102
102
  print(f"Method {method} not found in {instance}")
103
103
 
104
104
  @staticmethod
105
- def _remove_nan(x: pd.Series, y: pd.Series) -> tuple[np.ndarray, np.ndarray]:
105
+ def _remove_nan(x: pd.Series, y: pd.Series, z: pd.Series) -> tuple[np.ndarray, np.ndarray]:
106
106
  """
107
107
  Remove NaN values from two pandas Series and return them as numpy arrays.
108
108
 
@@ -110,22 +110,40 @@ class PyEvoMotionBase():
110
110
  :type x: pd.Series
111
111
  :param y: the second pandas Series.
112
112
  :type y: pd.Series
113
+ :param z: the third pandas Series.
114
+ :type z: pd.Series
113
115
  :return: a tuple with the two pandas Series without NaN values.
114
116
  :rtype: tuple[np.ndarray,np.ndarray]
115
117
  """
116
118
 
117
- data = pd.DataFrame({"x": x, "y": y}).dropna()
119
+ data = pd.DataFrame({"x": x, "y": y, "z": z}).dropna()
118
120
 
119
121
  x = data["x"].to_numpy().reshape(-1, 1)
120
122
  y = data["y"].to_numpy().reshape(-1, 1)
123
+ z = data["z"].to_numpy().reshape(-1, 1)
124
+ return x, y, z
121
125
 
122
- return x, y
126
+ @staticmethod
127
+ def _weighting_function(n: int, n_0: int = 30) -> np.ndarray:
128
+ """
129
+ Weighting function for the data points.
130
+
131
+ :param n: The number of data points.
132
+ :type n: int
133
+ :param n_0: The number of data points at which the weighting function approximates the constant 1. Default is 30.
134
+ :type n_0: int
135
+ :return: The weighting function.
136
+ :rtype: np.ndarray
137
+ """
138
+
139
+ return np.tanh(2*n/n_0)
123
140
 
124
141
  @classmethod
125
142
  def linear_regression(cls,
126
143
  x: np.ndarray,
127
144
  y: np.ndarray,
128
- fit_intercept=True
145
+ weights: np.ndarray | None = None,
146
+ fit_intercept: bool = True
129
147
  ) -> dict[str, any]:
130
148
  """
131
149
  Perform a linear regression on a set of data.
@@ -136,6 +154,8 @@ class PyEvoMotionBase():
136
154
  :type y: np.ndarray
137
155
  :param fit_intercept: Whether to fit the intercept. Default is ``True``.
138
156
  :type fit_intercept: bool
157
+ :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
158
+ :type weights: np.ndarray | None
139
159
  :return: A dictionary containing:
140
160
 
141
161
  * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
@@ -145,7 +165,9 @@ class PyEvoMotionBase():
145
165
  :rtype: ``dict[str, any]``
146
166
  """
147
167
 
148
- reg = LinearRegression(fit_intercept=fit_intercept).fit(x,y)
168
+ _weights = cls._weighting_function(weights).flatten() if weights is not None else None
169
+
170
+ reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y, sample_weight=_weights)
149
171
 
150
172
  if fit_intercept:
151
173
  model = {
@@ -166,7 +188,7 @@ class PyEvoMotionBase():
166
188
  "expression": "mx"
167
189
  }
168
190
 
169
- model["r2"] = r2_score(y, reg.predict(x))
191
+ model["r2"] = r2_score(y, reg.predict(x), sample_weight=_weights)
170
192
 
171
193
  return model
172
194
 
@@ -192,7 +214,7 @@ class PyEvoMotionBase():
192
214
  return a*np.power(x, b)
193
215
 
194
216
  @classmethod
195
- def power_law_fit(cls, x: np.ndarray, y: np.ndarray) -> dict[str, any]:
217
+ def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None) -> dict[str, any]:
196
218
  """
197
219
  Perform a power law fit on a set of data.
198
220
 
@@ -200,6 +222,8 @@ class PyEvoMotionBase():
200
222
  :type x: np.ndarray
201
223
  :param y: A numpy array of the target.
202
224
  :type y: np.ndarray
225
+ :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
226
+ :type weights: np.ndarray | None
203
227
  :return: A dictionary containing:
204
228
 
205
229
  * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
@@ -209,10 +233,13 @@ class PyEvoMotionBase():
209
233
  :rtype: ``dict[str, any]``
210
234
  """
211
235
 
236
+ _weights = cls._weighting_function(weights).flatten() if weights is not None else None
237
+
212
238
  try:
213
239
  _popt, _, _, _msg, _ier = curve_fit(
214
240
  cls._power_law,
215
241
  x.T.tolist()[0], y.T.tolist()[0],
242
+ sigma=1/np.sqrt(_weights) if _weights is not None else None,
216
243
  full_output=True
217
244
  )
218
245
  except RuntimeError as e:
@@ -230,16 +257,18 @@ class PyEvoMotionBase():
230
257
  "alpha": _popt[1]
231
258
  },
232
259
  "expression": "d*x^alpha",
233
- "r2": r2_score(y, cls._power_law(x, *_popt))
260
+ "r2": r2_score(y, cls._power_law(x, *_popt), sample_weight=_weights)
234
261
  }
235
262
 
236
263
  return model
237
264
 
238
- @staticmethod
265
+ @classmethod
239
266
  def F_test(
267
+ cls,
240
268
  model1: dict[str,any],
241
269
  model2: dict[str,any],
242
- data: np.ndarray
270
+ data: np.ndarray,
271
+ weights: np.ndarray | None = None
243
272
  ) -> tuple[float, float]:
244
273
  """
245
274
  Perform an F-test between two models.
@@ -257,6 +286,11 @@ class PyEvoMotionBase():
257
286
  """
258
287
 
259
288
  data = data.flatten()
289
+
290
+ if weights is not None:
291
+ _weights = cls._weighting_function(weights.flatten())
292
+ else:
293
+ _weights = np.ones(len(data))
260
294
 
261
295
  # Note that p1 < p2 always. Won't do an assertion because I'm making sure elsewhere that the linear model does not have an intercept, i.e. it only has the slope
262
296
  p1 = len(model1["parameters"])
@@ -278,8 +312,8 @@ class PyEvoMotionBase():
278
312
  )
279
313
 
280
314
  # Sum the residuals without the infinite values
281
- RSS1 = RS1.sum(where=~mask)
282
- RSS2 = RS2.sum(where=~mask)
315
+ RSS1 = np.sum(_weights*RS1, where=~mask)
316
+ RSS2 = np.sum(_weights*RS2, where=~mask)
283
317
 
284
318
  F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
285
319
 
@@ -289,7 +323,8 @@ class PyEvoMotionBase():
289
323
  def adjust_model(cls,
290
324
  x: pd.Series,
291
325
  y: pd.Series,
292
- name: str = None
326
+ name: str = None,
327
+ weights: pd.Series | None = None
293
328
  ) -> dict[str, any]:
294
329
  """Adjust a model to the data.
295
330
 
@@ -299,12 +334,14 @@ class PyEvoMotionBase():
299
334
  :type y: pd.Series
300
335
  :param name: The name of the data. Default is ``None``.
301
336
  :type name: str
337
+ :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
338
+ :type weights: np.ndarray | None
302
339
  :return: A dictionary with the model.
303
340
  :rtype: ``dict[str, any]``
304
341
  :raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
305
342
  """
306
343
 
307
- x,y = cls._remove_nan(x, y)
344
+ x,y,w = cls._remove_nan(x, y, weights)
308
345
 
309
346
  # Raises an error if the dataset is (almost) empty at this point
310
347
  if (x.size <= 1) or (y.size <= 1):
@@ -313,10 +350,10 @@ class PyEvoMotionBase():
313
350
  f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
314
351
  )
315
352
 
316
- model1 = cls.linear_regression(x, y, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
317
- model2 = cls.power_law_fit(x, y)
353
+ model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
354
+ model2 = cls.power_law_fit(x, y, weights=w)
318
355
 
319
- _, p = cls.F_test(model1, model2, y)
356
+ _, p = cls.F_test(model1, model2, y, weights=w)
320
357
 
321
358
  if p < 0.05:
322
359
  model = model2
@@ -337,6 +374,7 @@ class PyEvoMotionBase():
337
374
  model_label: str,
338
375
  data_xlabel_units: str,
339
376
  ax: any,
377
+ dt_ratio: float,
340
378
  **kwargs: dict[str, any]
341
379
  ) -> None:
342
380
  """
@@ -376,13 +414,13 @@ class PyEvoMotionBase():
376
414
  point_kwargs[_k] = kwargs[k]
377
415
 
378
416
  ax.scatter(
379
- data_x,
417
+ data_x.to_numpy()*dt_ratio,
380
418
  data_y,
381
419
  **point_kwargs
382
420
  )
383
421
  ax.plot(
384
- data_x,
385
- model(data_x),
422
+ data_x.to_numpy()*dt_ratio,
423
+ model(data_x.to_numpy()*dt_ratio),
386
424
  label=model_label,
387
425
  **line_kwargs
388
426
  )
@@ -404,3 +442,28 @@ class PyEvoMotionBase():
404
442
  raise ValueError(
405
443
  f"The dataset is (almost) empty at this point of the analysis.\n{msg}"
406
444
  )
445
+
446
+ @staticmethod
447
+ def _get_time_ratio(dt: str, reference: str = "7D") -> float:
448
+ """Get the ratio of a time interval with respect to a reference interval.
449
+
450
+ :param dt: Time interval string (e.g. "5D", "7D", "10D", "14D", "12H")
451
+ :type dt: str
452
+ :param reference: Reference time interval string. Default is "7D".
453
+ :type reference: str
454
+ :return: The ratio of dt to reference
455
+ :rtype: float
456
+ """
457
+
458
+ return pd.Timedelta(dt) / pd.Timedelta(reference)
459
+
460
+ @classmethod
461
+ def _verify_dt(cls, dt: str) -> None:
462
+ """Verify that the time window string is greater than 1 day.
463
+
464
+ :param dt: Time window string (e.g. "5D", "7D", "10D", "14D")
465
+ :type dt: str
466
+ :raises ValueError: If the time window is not greater than 1 day
467
+ """
468
+ if cls._get_time_ratio(dt, "1D") <= 1:
469
+ raise ValueError(f"Time window must be greater than 1 day. Got {dt}")
@@ -62,7 +62,9 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
62
62
  :type date_range: tuple[str] | None
63
63
  """
64
64
 
65
+ self._verify_dt(dt)
65
66
  self.dt = dt
67
+ self.dt_ratio = self._get_time_ratio(dt)
66
68
 
67
69
  # Parse the input fasta and metadata files
68
70
  super().__init__(
@@ -89,7 +91,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
89
91
  def plot_results(cls,
90
92
  stats: pd.DataFrame,
91
93
  regs: dict[str, dict[str, any]],
92
- data_xlabel_units: str
94
+ data_xlabel_units: str,
95
+ dt_ratio: float
93
96
  ) -> None:
94
97
  """
95
98
  Plot the results of the analysis.
@@ -110,7 +113,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
110
113
  for k,v in regs.items()
111
114
  if k.startswith("mean")
112
115
  )
113
- _mean_data = stats[stats.columns[1]]
116
+ _mean_data = stats[stats.columns[2]]
114
117
  cls.plot_single_data_and_model(
115
118
  stats.index,
116
119
  _mean_data,
@@ -118,7 +121,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
118
121
  _model["model"],
119
122
  r"$r^2$: " + f"{_model['r2']:.2f}",
120
123
  data_xlabel_units,
121
- ax[0]
124
+ ax[0],
125
+ dt_ratio=dt_ratio
122
126
  )
123
127
 
124
128
  # Variance
@@ -127,7 +131,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
127
131
  for k,v in regs.items()
128
132
  if k.startswith("scaled var")
129
133
  )
130
- _variance_data = stats[stats.columns[2]]
134
+ _variance_data = stats[stats.columns[3]]
131
135
  cls.plot_single_data_and_model(
132
136
  stats.index,
133
137
  _variance_data,
@@ -135,7 +139,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
135
139
  _model["model"],
136
140
  r"$r^2$: " + f"{_model['r2']:.2f}",
137
141
  data_xlabel_units,
138
- ax[1]
142
+ ax[1],
143
+ dt_ratio=dt_ratio
139
144
  )
140
145
 
141
146
  # Dispersion index
@@ -147,6 +152,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
147
152
  "Poissonian regime",
148
153
  data_xlabel_units,
149
154
  ax[2],
155
+ dt_ratio=dt_ratio,
150
156
  line_linestyle="--",
151
157
  line_color="black"
152
158
  )
@@ -159,6 +165,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
159
165
  stats: pd.DataFrame,
160
166
  regs: dict[str, dict[str, any]],
161
167
  data_xlabel_units: str,
168
+ dt_ratio: float,
162
169
  output_ptr: str | None = None
163
170
  ) -> None:
164
171
  """
@@ -183,7 +190,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
183
190
  for k,v in regs.items()
184
191
  if k.startswith("mean")
185
192
  )
186
- _mean_data = stats[stats.columns[1]]
193
+ _mean_data = stats[stats.columns[2]]
187
194
  cls.plot_single_data_and_model(
188
195
  stats.index,
189
196
  _mean_data,
@@ -191,7 +198,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
191
198
  _model["model"],
192
199
  r"$r^2$: " + f"{_model['r2']:.2f}",
193
200
  data_xlabel_units,
194
- plt.gca()
201
+ plt.gca(),
202
+ dt_ratio=dt_ratio
195
203
  )
196
204
 
197
205
  plt.title(_mean_data.name)
@@ -205,7 +213,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
205
213
  for k,v in regs.items()
206
214
  if k.startswith("scaled var")
207
215
  )
208
- _variance_data = stats[stats.columns[2]]
216
+ _variance_data = stats[stats.columns[3]]
209
217
  cls.plot_single_data_and_model(
210
218
  stats.index,
211
219
  _variance_data,
@@ -213,7 +221,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
213
221
  lambda x: _model["model"](x) + _variance_data.min(), # Adjust the model to the original variance
214
222
  r"$r^2$: " + f"{_model['r2']:.2f}",
215
223
  data_xlabel_units,
216
- plt.gca()
224
+ plt.gca(),
225
+ dt_ratio=dt_ratio
217
226
  )
218
227
 
219
228
  plt.title(_variance_data.name)
@@ -232,6 +241,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
232
241
  "Poissonian regime",
233
242
  data_xlabel_units,
234
243
  plt.gca(),
244
+ dt_ratio=dt_ratio,
235
245
  line_linestyle="--",
236
246
  line_color="black"
237
247
  )
@@ -360,7 +370,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
360
370
  def compute_stats(self,
361
371
  DT: str,
362
372
  origin: str,
363
- n_threshold: int | None = None,
364
373
  mutation_kind: str = "all"
365
374
  ) -> pd.DataFrame:
366
375
  """
@@ -372,31 +381,37 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
372
381
  :type DT: str
373
382
  :param origin: The string datetime that will be the origin of the grouping.
374
383
  :type origin: str
375
- :param n_threshold: Minimum number of sequences required in a time interval to compute statistics.
376
- :type n_threshold: int | None
377
384
  :param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions``, ``insertions``, ``deletions`` or ``indels``. Default is ``all``.
378
385
  :return: The statistics of the data.
379
386
  :rtype: ``pd.DataFrame``
380
387
  """
381
388
 
382
- grouped = self.date_grouper(self.data, DT, origin)
389
+ # Create a local copy of the data
390
+ _data = self.data.copy()
383
391
 
384
- # Only keep weeks where the number of observations is greater than the threshold
385
- if n_threshold:
392
+ # If the very first row's date is the same as the origin, and there happens to be only one entry for that date, duplicate that row; this way the stats for the first week can be computed (with variance = 0 of course)
393
+ if _data.iloc[0]["date"] == origin and len(_data[_data["date"] == origin]) == 1:
394
+ _data = pd.concat([_data, pd.DataFrame([_data.iloc[0]])], ignore_index=True)
395
+ _data.sort_values(by="date", inplace=True)
396
+ _data.reset_index(drop=True, inplace=True)
386
397
 
387
- _filtered = grouped.filter(lambda x: len(x) >= n_threshold)
398
+ # Group the data by the datetime interval
399
+ grouped = self.date_grouper(_data, DT, origin)
388
400
 
389
- if len(_filtered) == 0:
390
- raise ValueError(
391
- f"No groups with at least {n_threshold} observations. Consider lowering the threshold."
392
- )
401
+ # Only keep weeks where the number of observations is greater than 1
402
+ _filtered = grouped.filter(lambda x: len(x) >= 2)
393
403
 
394
- grouped = self.date_grouper(
395
- _filtered,
396
- DT,
397
- origin
404
+ if len(_filtered) == 0:
405
+ raise ValueError(
406
+ f"No groups with at least 2 observations. Consider widening the time interval."
398
407
  )
399
408
 
409
+ grouped = self.date_grouper(
410
+ _filtered,
411
+ DT,
412
+ origin
413
+ )
414
+
400
415
  levels = [
401
416
  f"number of {x}"
402
417
  for x in self._mutation_type_switch(mutation_kind)
@@ -416,7 +431,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
416
431
 
417
432
  def analysis(self,
418
433
  length: int,
419
- n_threshold: int | None = None,
420
434
  show: bool = False,
421
435
  mutation_kind: str = "all",
422
436
  export_plots_filename: str | None = None
@@ -428,7 +442,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
428
442
 
429
443
  :param length: The length to filter by.
430
444
  :type length: int
431
- :param n_threshold: Minimum number of sequences required in a time interval to compute statistics.
432
445
  :param show: Whether to show the plots or not. Default is False.
433
446
  :type show: bool
434
447
  :param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions`` or ``indels``. Default is ``all``.
@@ -447,20 +460,22 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
447
460
  stats = self.compute_stats(
448
461
  self.dt,
449
462
  self.origin,
450
- n_threshold,
451
463
  mutation_kind
452
464
  )
453
465
 
466
+ # Get weights for weighted fitting
467
+ weights = stats["size"]
454
468
 
455
469
  regs = {}
456
470
  # For each column in the statistics (except the date and the size), compute the corresponding regression model
457
471
  for col in stats.columns[1:-1]:
458
472
  if col.startswith("mean"):
459
473
  _single_regression = {
460
- f"{col} per {self.dt} model": self.linear_regression(
474
+ f"{col} model": self.linear_regression(
461
475
  *self._remove_nan(
462
476
  stats.index, # Regression is given by the index, so in time, it is the same as multiplying by dt days
463
- stats[col]
477
+ stats[col],
478
+ weights
464
479
  )
465
480
  )
466
481
  }
@@ -468,33 +483,59 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
468
483
  _single_regression = self.adjust_model(
469
484
  stats.index,
470
485
  stats[col] - stats[col].min(),
471
- name=f"scaled {col} per {self.dt} model"
486
+ name=f"scaled {col} model",
487
+ weights=weights.to_numpy().flatten()
472
488
  )
473
489
  # Save the regression model
474
490
  regs.update(_single_regression)
475
491
 
492
+ # Add scaling correction to the regression models
493
+ for k, v in regs.items():
494
+ if v["expression"] == "mx + b":
495
+ m = v["parameters"]["m"]
496
+ b = v["parameters"]["b"]
497
+ regs[k]["parameters"]["m"] = m/self.dt_ratio
498
+ m = regs[k]["parameters"]["m"]
499
+ regs[k]["model"] = lambda x: m*x + b
500
+ elif v["expression"] == "mx":
501
+ m = v["parameters"]["m"]
502
+ regs[k]["parameters"]["m"] = m/self.dt_ratio
503
+ m = regs[k]["parameters"]["m"]
504
+ regs[k]["model"] = lambda x: m*x
505
+
506
+ elif v["expression"] == "d*x^alpha":
507
+ d = v["parameters"]["d"]
508
+ alpha = v["parameters"]["alpha"]
509
+ regs[k]["parameters"]["d"] = d/(self.dt_ratio**alpha)
510
+ d = regs[k]["parameters"]["d"]
511
+ regs[k]["model"] = lambda x: d*(x**alpha)
512
+
476
513
  # Sets of mutation types used in the analysis
477
514
  _sets = sorted({
478
515
  " ".join(x.split()[1:])
479
516
  for x in stats.columns[1:-1]
480
517
  })
481
518
 
519
+ stats["dt_idx"] = (stats["date"] - stats["date"].min()) / pd.Timedelta("7D")
520
+
482
521
  # Plot the results
483
522
  if show:
484
523
  # For each set of mutation types
485
524
  for _type in _sets:
486
525
  self.plot_results(
487
- stats[["date", f"mean {_type}", f"var {_type}"]],
526
+ stats[["date", "dt_idx", f"mean {_type}", f"var {_type}"]],
488
527
  {
489
528
  k: v
490
529
  for k, v in regs.items()
491
530
  if k in (
492
- f"mean {_type} per {self.dt} model",
493
- f"scaled var {_type} per {self.dt} model"
531
+ f"mean {_type} model",
532
+ f"scaled var {_type} model"
494
533
  )
495
534
  },
496
- f"in steps of {self.dt} since {self.origin}"
535
+ "wk",
536
+ self.dt_ratio
497
537
  )
538
+
498
539
  # Export the plots
499
540
  if export_plots_filename:
500
541
  # Open pdf file pointer
@@ -502,19 +543,22 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
502
543
  # For each set of mutation types save the plots
503
544
  for _type in _sets:
504
545
  self.export_plot_results(
505
- stats[["date", f"mean {_type}", f"var {_type}"]],
546
+ stats[["date", "dt_idx", f"mean {_type}", f"var {_type}"]],
506
547
  {
507
548
  k: v
508
549
  for k, v in regs.items()
509
550
  if k in (
510
- f"mean {_type} per {self.dt} model",
511
- f"scaled var {_type} per {self.dt} model"
551
+ f"mean {_type} model",
552
+ f"scaled var {_type} model"
512
553
  )
513
554
  },
514
- f"in steps of {self.dt} since {self.origin}",
555
+ "wk",
556
+ self.dt_ratio,
515
557
  pdf
516
558
  )
517
559
  # Close pdf file pointer
518
560
  pdf.close()
519
561
 
520
562
  return stats, regs
563
+
564
+