copulas 0.9.2.dev0__py2.py3-none-any.whl → 0.10.0.dev0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of copulas might be problematic. Click here for more details.

copulas/__init__.py CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  __author__ = 'DataCebo, Inc.'
6
6
  __email__ = 'info@sdv.dev'
7
- __version__ = '0.9.2.dev0'
7
+ __version__ = '0.10.0.dev0'
8
8
 
9
9
  import contextlib
10
10
  import importlib
copulas/visualization.py CHANGED
@@ -1,136 +1,320 @@
1
1
  """Visualization utilities for the Copulas library."""
2
2
 
3
3
  import pandas as pd
4
+ import plotly.express as px
5
+ import plotly.figure_factory as ff
4
6
 
5
- try:
6
- import matplotlib.pyplot as plt
7
- except RuntimeError as e:
8
- if 'Python is not installed as a framework.' in e.message:
9
- import matplotlib
10
- matplotlib.use('PS') # Avoid crash on macos
11
- import matplotlib.pyplot as plt
12
7
 
8
+ class PlotConfig:
9
+ """Custom plot settings for visualizations."""
13
10
 
14
- def scatter_3d(data, columns=None, fig=None, title=None, position=None):
15
- """Plot 3 dimensional data in a scatter plot."""
16
- fig = fig or plt.figure()
17
- position = position or 111
11
+ DATACEBO_DARK = '#000036'
12
+ DATACEBO_GREEN = '#01E0C9'
13
+ BACKGROUND_COLOR = '#F5F5F8'
14
+ FONT_SIZE = 18
18
15
 
19
- ax = fig.add_subplot(position, projection='3d')
20
- ax.scatter(*(
21
- data[column]
22
- for column in columns or data.columns
23
- ))
24
- if title:
25
- ax.set_title(title)
26
- ax.title.set_position([.5, 1.05])
27
16
 
28
- return ax
17
+ def _generate_1d_plot(data, title, labels, colors):
18
+ """Generate a density plot of an array-like structure.
29
19
 
20
+ Args:
21
+ data (array-like structure):
22
+ The data to plot.
23
+ title (str):
24
+ The title of the plot.
25
+ labels (list[str]):
26
+ The labels of the data.
27
+ colors (list[str]):
28
+ The colors of the data.
29
+
30
+ Returns:
31
+ plotly.graph_objects._figure.Figure
32
+ """
33
+ fig = ff.create_distplot(
34
+ hist_data=data,
35
+ group_labels=labels,
36
+ show_hist=False,
37
+ show_rug=False,
38
+ colors=colors
39
+ )
40
+
41
+ for i, name in enumerate(labels):
42
+ fig.update_traces(
43
+ x=fig.data[i].x,
44
+ hovertemplate=f'<b>{name}</b><br>Frequency: %{{y}}<extra></extra>',
45
+ selector={'name': name},
46
+ fill='tozeroy',
47
+ )
48
+
49
+ fig.update_layout(
50
+ title=title,
51
+ plot_bgcolor=PlotConfig.BACKGROUND_COLOR,
52
+ font={'size': PlotConfig.FONT_SIZE},
53
+ showlegend=True if labels[0] else False,
54
+ xaxis_title='value',
55
+ yaxis_title='frequency'
56
+ )
57
+
58
+ return fig
59
+
60
+
61
+ def dist_1d(data, title=None, label=None):
62
+ """Plot the 1 dimensional data.
30
63
 
31
- def scatter_2d(data, columns=None, fig=None, title=None, position=None):
32
- """Plot 2 dimensional data in a scatter plot."""
33
- fig = fig or plt.figure()
34
- position = position or 111
64
+ Args:
65
+ data (array_like structure):
66
+ The table data.
67
+ title (str):
68
+ The title of the plot.
69
+ label (str):
70
+ The label of the plot.
71
+
72
+ Returns:
73
+ plotly.graph_objects._figure.Figure
74
+ """
75
+ if not title:
76
+ title = 'Data'
77
+ if isinstance(data, pd.DataFrame):
78
+ title += f" for column '{data.columns[0]}'"
79
+ elif isinstance(data, pd.Series) and data.name:
80
+ title += f" for column '{data.name}'"
35
81
 
36
- ax = fig.add_subplot(position)
37
- columns = columns or data.columns
38
- if len(columns) != 2:
39
- raise ValueError('Only 2 columns can be plotted')
82
+ return _generate_1d_plot(
83
+ data=[data],
84
+ title=title,
85
+ labels=[label],
86
+ colors=[PlotConfig.DATACEBO_DARK]
87
+ )
40
88
 
41
- x, y = columns
42
89
 
43
- ax.scatter(data[x], data[y])
44
- plt.xlabel(x)
45
- plt.ylabel(y)
90
+ def compare_1d(real, synth, title=None):
91
+ """Plot the comparison between real and synthetic data.
92
+
93
+ Args:
94
+ real (array_like):
95
+ The real data.
96
+ synth (array_like):
97
+ The synthetic data.
98
+ title (str):
99
+ The title of the plot.
100
+
101
+ Returns:
102
+ plotly.graph_objects._figure.Figure
103
+ """
104
+ if not title:
105
+ title = 'Real vs. Synthetic Data'
106
+ if isinstance(real, pd.DataFrame):
107
+ title += f" for column '{real.columns[0]}'"
108
+ elif isinstance(real, pd.Series) and real.name:
109
+ title += f" for column '{real.name}'"
46
110
 
47
- if title:
48
- ax.set_title(title)
49
- ax.title.set_position([.5, 1.05])
111
+ return _generate_1d_plot(
112
+ data=[real, synth],
113
+ title=title,
114
+ labels=['Real', 'Synthetic'],
115
+ colors=[PlotConfig.DATACEBO_DARK, PlotConfig.DATACEBO_GREEN]
116
+ )
50
117
 
51
- return ax
52
118
 
119
+ def _generate_scatter_2d_plot(data, columns, color_discrete_map, title):
120
+ """Generate a scatter plot for a pair of columns.
53
121
 
54
- def hist_1d(data, fig=None, title=None, position=None, bins=20, label=None):
55
- """Plot 1 dimensional data in a histogram."""
56
- fig = fig or plt.figure()
57
- position = position or 111
122
+ Args:
123
+ data (pandas.DataFrame):
124
+ The data for the desired column pair containing a
125
+ ``Data`` column indicating whether it is real or synthetic.
126
+ columns (list):
127
+ A list of the columns being plotted.
128
+ color_discrete_map (dict):
129
+ A dictionary mapping the values of the ``Data`` column to the colors
130
+ used to plot them.
131
+ title (str):
132
+ The title of the plot.
133
+
134
+ Returns:
135
+ plotly.graph_objects._figure.Figure
136
+ """
137
+ if columns:
138
+ columns.append('Data')
139
+ else:
140
+ columns = data.columns
58
141
 
59
- ax = fig.add_subplot(position)
60
- ax.hist(data, density=True, bins=bins, alpha=0.8, label=label)
142
+ if len(columns) != 3: # includes the 'Data' column
143
+ raise ValueError('Only 2 columns can be plotted')
61
144
 
62
- if label:
63
- ax.legend()
145
+ fig = px.scatter(
146
+ data,
147
+ x=columns[0],
148
+ y=columns[1],
149
+ color='Data',
150
+ color_discrete_map=color_discrete_map,
151
+ symbol='Data'
152
+ )
64
153
 
65
- if title:
66
- ax.set_title(title)
67
- ax.title.set_position([.5, 1.05])
154
+ fig.update_layout(
155
+ title=title,
156
+ plot_bgcolor=PlotConfig.BACKGROUND_COLOR,
157
+ font={'size': PlotConfig.FONT_SIZE},
158
+ showlegend=False if len(color_discrete_map) == 1 else True,
159
+ )
68
160
 
69
- return ax
161
+ return fig
70
162
 
71
163
 
72
- def side_by_side(plotting_func, arrays):
73
- """Plot side-by-side figures.
164
+ def scatter_2d(data, columns=None, title=None):
165
+ """Plot 2 dimensional data in a scatter plot.
74
166
 
75
167
  Args:
76
- plotting_func (callable):
77
- A matplotlib function which takes in the standard plot kwargs.
78
- arrays (dict[str, np.ndarray]):
79
- A mapping from the name of the subplot to the values.
168
+ data (pandas.DataFrame):
169
+ The table data.
170
+ columns (list[string]):
171
+ The names of the two columns to plot.
172
+ title (str):
173
+ The title of the plot.
174
+
175
+ Returns:
176
+ plotly.graph_objects._figure.Figure
80
177
  """
81
- fig = plt.figure(figsize=(10, 4))
178
+ data = data.copy()
179
+ data['Data'] = 'Real'
82
180
 
83
- position_base = f'1{len(arrays)}'
84
- for index, (title, array) in enumerate(arrays.items()):
85
- position = int(position_base + str(index + 1))
86
- plotting_func(array, fig=fig, title=title, position=position)
181
+ if not title:
182
+ title = 'Data'
183
+ if columns:
184
+ title += f" for columns '{columns[0]}' and '{columns[1]}'"
185
+ elif isinstance(data, pd.DataFrame):
186
+ title += f" for columns '{data.columns[0]}' and '{data.columns[1]}'"
87
187
 
88
- plt.tight_layout()
188
+ return _generate_scatter_2d_plot(
189
+ data=data,
190
+ columns=columns,
191
+ color_discrete_map={'Real': PlotConfig.DATACEBO_DARK},
192
+ title=title
193
+ )
89
194
 
90
195
 
91
- def compare_3d(real, synth, columns=None, figsize=(10, 4)):
92
- """Generate a 3d scatter plot comparing real/synthetic data.
196
+ def compare_2d(real, synth, columns=None, title=None):
197
+ """Plot the comparison between real and synthetic data for a given column pair.
93
198
 
94
199
  Args:
95
- real (pd.DataFrame):
96
- The real data.
97
- synth (pd.DataFrame):
98
- The synthetic data.
200
+ real (pandas.DataFrame):
201
+ The real table data.
202
+ synth (pandas.DataFrame):
203
+ The synthetic table data.
204
+ columns (list[string]):
205
+ The names of the two columns to plot.
206
+ title (str):
207
+ The title of the plot.
208
+
209
+ Returns:
210
+ plotly.graph_objects._figure.Figure
211
+ """
212
+ real, synth = real.copy(), synth.copy()
213
+ real['Data'] = 'Real'
214
+ synth['Data'] = 'Synthetic'
215
+ data = pd.concat([real, synth], axis=0, ignore_index=True)
216
+
217
+ if not title:
218
+ title = 'Real vs. Synthetic Data'
219
+ if columns:
220
+ title += f" for columns '{columns[0]}' and '{columns[1]}'"
221
+ elif isinstance(data, pd.DataFrame):
222
+ title += f" for columns '{data.columns[0]}' and '{data.columns[1]}'"
223
+
224
+ return _generate_scatter_2d_plot(
225
+ data=data,
226
+ columns=columns,
227
+ color_discrete_map={
228
+ 'Real': PlotConfig.DATACEBO_DARK,
229
+ 'Synthetic': PlotConfig.DATACEBO_GREEN
230
+ },
231
+ title=title
232
+ )
233
+
234
+
235
+ def _generate_scatter_3d_plot(data, columns, color_discrete_map, title):
236
+ """Generate a 3D scatter plot for a column triplet.
237
+
238
+ Args:
239
+ data (pandas.DataFrame):
240
+ The data for the desired three columns containing a
241
+ ``Data`` column that indicates whether it is real or synthetic.
99
242
  columns (list):
100
- The name of the columns to plot.
101
- figsize:
102
- Figure size, passed to matplotlib.
243
+ A list of the columns being plotted.
244
+ color_discrete_map (dict):
245
+ A dictionary mapping the values of the ``Data`` column to the colors
246
+ used to plot them.
247
+ title (str):
248
+ The title of the plot.
249
+
250
+ Returns:
251
+ plotly.graph_objects._figure.Figure
103
252
  """
104
- columns = columns or real.columns
105
- fig = plt.figure(figsize=figsize)
253
+ if columns:
254
+ columns.append('Data')
255
+ else:
256
+ columns = data.columns
257
+
258
+ if len(columns) != 4: # includes the 'Data' column
259
+ raise ValueError('Only 3 columns can be plotted')
260
+
261
+ fig = px.scatter_3d(
262
+ data,
263
+ x=columns[0],
264
+ y=columns[1],
265
+ z=columns[2],
266
+ color='Data',
267
+ color_discrete_map=color_discrete_map,
268
+ symbol='Data',
269
+ )
106
270
 
107
- scatter_3d(real[columns], fig=fig, title='Real Data', position=121)
108
- scatter_3d(synth[columns], fig=fig, title='Synthetic Data', position=122)
271
+ fig.update_traces(marker={'size': 5})
109
272
 
110
- plt.tight_layout()
273
+ fig.update_layout(
274
+ title=title,
275
+ plot_bgcolor=PlotConfig.BACKGROUND_COLOR,
276
+ font={'size': PlotConfig.FONT_SIZE},
277
+ showlegend=False if len(color_discrete_map) == 1 else True,
278
+ )
111
279
 
280
+ return fig
112
281
 
113
- def compare_2d(real, synth, columns=None, figsize=None):
114
- """Generate a 2d scatter plot comparing real/synthetic data.
282
+
283
+ def scatter_3d(data, columns=None, title=None):
284
+ """Plot 3 dimensional data in a scatter plot.
115
285
 
116
286
  Args:
117
- real (pd.DataFrame):
118
- The real data.
119
- synth (pd.DataFrame):
120
- The synthetic data.
121
- columns (list):
122
- The name of the columns to plot.
123
- figsize:
124
- Figure size, passed to matplotlib.
287
+ data (pandas.DataFrame):
288
+ The table data. Must have at least 3 columns.
289
+ columns (list[string]):
290
+ The names of the three columns to plot.
291
+ title (str):
292
+ The title of the plot.
293
+
294
+ Returns:
295
+ plotly.graph_objects._figure.Figure
125
296
  """
126
- x, y = columns or real.columns
127
- ax = real.plot.scatter(x, y, color='blue', alpha=0.5, figsize=figsize)
128
- ax = synth.plot.scatter(x, y, ax=ax, color='orange', alpha=0.5, figsize=figsize)
129
- ax.legend(['Real', 'Synthetic'])
297
+ data = data.copy()
298
+ data['Data'] = 'Real'
299
+
300
+ if not title:
301
+ title = 'Data'
302
+ if columns:
303
+ title += f" for columns '{columns[0]}', '{columns[1]}' and '{columns[2]}'"
304
+ elif isinstance(data, pd.DataFrame):
305
+ title += \
306
+ f" for columns '{data.columns[0]}', '{data.columns[1]}' and '{data.columns[2]}'"
130
307
 
308
+ return _generate_scatter_3d_plot(
309
+ data=data,
310
+ columns=columns,
311
+ color_discrete_map={'Real': PlotConfig.DATACEBO_DARK},
312
+ title=title
313
+ )
131
314
 
132
- def compare_1d(real, synth, columns=None, figsize=None):
133
- """Generate a 1d scatter plot comparing real/synthetic data.
315
+
316
+ def compare_3d(real, synth, columns=None, title=None):
317
+ """Plot the comparison between real and synthetic data for a given column triplet.
134
318
 
135
319
  Args:
136
320
  real (pd.DataFrame):
@@ -139,26 +323,28 @@ def compare_1d(real, synth, columns=None, figsize=None):
139
323
  The synthetic data.
140
324
  columns (list):
141
325
  The name of the columns to plot.
142
- figsize:
143
- Figure size, passed to matplotlib.
326
+ title (str):
327
+ The title of the plot.
144
328
  """
145
- if len(real.shape) == 1:
146
- real = pd.DataFrame({'': real})
147
- synth = pd.DataFrame({'': synth})
148
-
149
- columns = columns or real.columns
150
-
151
- num_cols = len(columns)
152
- fig_cols = min(2, num_cols)
153
- fig_rows = (num_cols // fig_cols) + 1
154
- prefix = f'{fig_rows}{fig_cols}'
155
-
156
- figsize = figsize or (5 * fig_cols, 3 * fig_rows)
157
- fig = plt.figure(figsize=figsize)
158
-
159
- for idx, column in enumerate(columns):
160
- position = int(prefix + str(idx + 1))
161
- hist_1d(real[column], fig=fig, position=position, title=column, label='Real')
162
- hist_1d(synth[column], fig=fig, position=position, title=column, label='Synthetic')
163
-
164
- plt.tight_layout()
329
+ real, synth = real.copy(), synth.copy()
330
+ real['Data'] = 'Real'
331
+ synth['Data'] = 'Synthetic'
332
+ data = pd.concat([real, synth], axis=0, ignore_index=True)
333
+
334
+ if not title:
335
+ title = 'Real vs. Synthetic Data'
336
+ if columns:
337
+ title += f" for columns '{columns[0]}', '{columns[1]}' and '{columns[2]}'"
338
+ elif isinstance(data, pd.DataFrame):
339
+ title += \
340
+ f" for columns '{data.columns[0]}', '{data.columns[1]}' and '{data.columns[2]}'"
341
+
342
+ return _generate_scatter_3d_plot(
343
+ data=data,
344
+ columns=columns,
345
+ color_discrete_map={
346
+ 'Real': PlotConfig.DATACEBO_DARK,
347
+ 'Synthetic': PlotConfig.DATACEBO_GREEN
348
+ },
349
+ title=title
350
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: copulas
3
- Version: 0.9.2.dev0
3
+ Version: 0.10.0.dev0
4
4
  Summary: Create tabular synthetic data using copulas-based modeling.
5
5
  Home-page: https://github.com/sdv-dev/Copulas
6
6
  Author: DataCebo, Inc.
@@ -21,11 +21,10 @@ Requires-Python: >=3.8,<3.12
21
21
  Description-Content-Type: text/markdown
22
22
  License-File: LICENSE
23
23
  License-File: AUTHORS.rst
24
- Requires-Dist: matplotlib <4,>=3.4.0 ; python_version < "3.10"
24
+ Requires-Dist: plotly <6,>=5.10.0
25
25
  Requires-Dist: numpy <2,>=1.20.0 ; python_version < "3.10"
26
26
  Requires-Dist: pandas >=1.1.3 ; python_version < "3.10"
27
27
  Requires-Dist: scipy <2,>=1.5.4 ; python_version < "3.10"
28
- Requires-Dist: matplotlib <4,>=3.6.0 ; python_version >= "3.10"
29
28
  Requires-Dist: numpy <2,>=1.23.3 ; python_version >= "3.10"
30
29
  Requires-Dist: scipy <2,>=1.9.2 ; python_version >= "3.10"
31
30
  Requires-Dist: pandas >=1.3.4 ; python_version >= "3.10" and python_version < "3.11"
@@ -230,6 +229,19 @@ for specific needs.
230
229
 
231
230
  # History
232
231
 
232
+ ## v0.9.2 - 2023-10-12
233
+
234
+ This release removes a warning that was being raised when univariate distributions failed to fit and logs the message instead.
235
+
236
+ ### New Features
237
+
238
+ * When Copulas univariate fit fails, produce a log instead of a warning - Issue [#359](https://github.com/sdv-dev/Copulas/issues/359) by @R-Palazzo
239
+
240
+ ### Maintenance
241
+
242
+ * Switch default branch from master to main - Issue [#360](https://github.com/sdv-dev/Copulas/issues/360) by @amontanez24
243
+ * Update add-on detection for Copulas - Issue [#362](https://github.com/sdv-dev/Copulas/issues/362) by @pvk-developer
244
+
233
245
  ## v0.9.1 - 2023-08-10
234
246
 
235
247
  This release fixes problems with the documentation site and drops support for Python 3.7.
@@ -1,6 +1,6 @@
1
- copulas/__init__.py,sha256=o-qZcJYQLxfaEO9lFOF48pUwK-hw_1nSXJdMbtt2XyU,9285
1
+ copulas/__init__.py,sha256=Xdj9tDeUWcwL-oNXOxeenA1w-c9KqQEWZZA8COay2Bk,9286
2
2
  copulas/datasets.py,sha256=KMNCJXcOOMp28xML-Q_wQHwrpflRnR9Kkcxre-ubG9A,6831
3
- copulas/visualization.py,sha256=M3ZkLIxSXsbh8rZ2LEA5Tr6Ow1k-TYnJqq8HOr8xHrE,4665
3
+ copulas/visualization.py,sha256=f4wIkJ_AKrvBtrDGoNP0lofO75KL2JrHKhtLOz-GPkA,9707
4
4
  copulas/bivariate/__init__.py,sha256=dn4sz2B0Nqt6p4eJG6CpHYhtvLQcZekxlmj5Kd0zcvo,5087
5
5
  copulas/bivariate/base.py,sha256=t1g_c_TgIMkF-gp2wsh6R5jaLb7IE-GpYYlvNFjiYQ4,13952
6
6
  copulas/bivariate/clayton.py,sha256=Tj2DGsycC7pfoSGTtG_f_o33snvPqaSbVIlxrll-G_Q,4536
@@ -25,9 +25,9 @@ copulas/univariate/selection.py,sha256=uC-l8osnbx50Gqx4-WLfKTLco0ncb41TDEbdt1hp_
25
25
  copulas/univariate/student_t.py,sha256=2CqIECLk4-rtFUASn79lUk8EHvIyIqjgRo1h2xEDpEk,823
26
26
  copulas/univariate/truncated_gaussian.py,sha256=OkuD0YyjtPY7b2mpjqy2lgrOPwlJffAebfTVj8K6bZY,1932
27
27
  copulas/univariate/uniform.py,sha256=Jou_lxzDHc2BMNIJ7iLh5ffw1xJvBIiAAd2r3k_Mqes,754
28
- copulas-0.9.2.dev0.dist-info/AUTHORS.rst,sha256=8Xis8XHO9jCJXBVc6_MItcHhcWF3gG2tlIRUs_L8DS0,77
29
- copulas-0.9.2.dev0.dist-info/LICENSE,sha256=cORU2kpIo9Qyy7Kv2ZpYDIIcksrjqlNEL9c9Ic1ayo0,4822
30
- copulas-0.9.2.dev0.dist-info/METADATA,sha256=SKj4VUPsn3CEvL4jgNJp0CbgIMH58FGUlrkoLEA-79I,20754
31
- copulas-0.9.2.dev0.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
32
- copulas-0.9.2.dev0.dist-info/top_level.txt,sha256=xNXWuWoZ-U3Gb734WqQxkF5RIeGDVU3IstjD-RnWsk8,8
33
- copulas-0.9.2.dev0.dist-info/RECORD,,
28
+ copulas-0.10.0.dev0.dist-info/AUTHORS.rst,sha256=8Xis8XHO9jCJXBVc6_MItcHhcWF3gG2tlIRUs_L8DS0,77
29
+ copulas-0.10.0.dev0.dist-info/LICENSE,sha256=cORU2kpIo9Qyy7Kv2ZpYDIIcksrjqlNEL9c9Ic1ayo0,4822
30
+ copulas-0.10.0.dev0.dist-info/METADATA,sha256=FaniACxX9B-ok5BwI9eS9w1RyaKeUv5Egs9DQxfPIFs,21239
31
+ copulas-0.10.0.dev0.dist-info/WHEEL,sha256=P2T-6epvtXQ2cBOE_U1K4_noqlJFN3tj15djMgEu4NM,110
32
+ copulas-0.10.0.dev0.dist-info/top_level.txt,sha256=xNXWuWoZ-U3Gb734WqQxkF5RIeGDVU3IstjD-RnWsk8,8
33
+ copulas-0.10.0.dev0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.2)
2
+ Generator: bdist_wheel (0.41.3)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py2-none-any
5
5
  Tag: py3-none-any