theseusplot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- theseusplot-0.1.0/.github/workflows/ci.yml +41 -0
- theseusplot-0.1.0/.github/workflows/publish.yml +57 -0
- theseusplot-0.1.0/PKG-INFO +315 -0
- theseusplot-0.1.0/README-figures/factor_column-13.png +0 -0
- theseusplot-0.1.0/README-figures/overview-1.png +0 -0
- theseusplot-0.1.0/README-figures/plot_carrier-5.png +0 -0
- theseusplot-0.1.0/README-figures/plot_carrier_n-7.png +0 -0
- theseusplot-0.1.0/README-figures/plot_dep_delay-9.png +0 -0
- theseusplot-0.1.0/README-figures/plot_dep_delay_n-11.png +0 -0
- theseusplot-0.1.0/README-figures/plot_origin-3.png +0 -0
- theseusplot-0.1.0/README.Rmd +284 -0
- theseusplot-0.1.0/README.md +297 -0
- theseusplot-0.1.0/TheseusPlot_py.Rproj +13 -0
- theseusplot-0.1.0/pyproject.toml +54 -0
- theseusplot-0.1.0/src/theseusplot/__init__.py +11 -0
- theseusplot-0.1.0/src/theseusplot/_config.py +43 -0
- theseusplot-0.1.0/src/theseusplot/_ship.py +1224 -0
- theseusplot-0.1.0/src/theseusplot/py.typed +1 -0
- theseusplot-0.1.0/tests/fixtures/table_factor_order.json +76 -0
- theseusplot-0.1.0/tests/fixtures/table_missing_asymmetric_categories.json +80 -0
- theseusplot-0.1.0/tests/fixtures/table_top_n_aggregation.json +81 -0
- theseusplot-0.1.0/tests/test_continuous_binning.py +94 -0
- theseusplot-0.1.0/tests/test_contribution_algorithm.py +80 -0
- theseusplot-0.1.0/tests/test_fixtures.py +108 -0
- theseusplot-0.1.0/tests/test_plot.py +118 -0
- theseusplot-0.1.0/tests/test_public_api.py +105 -0
- theseusplot-0.1.0/tests/test_table_behavior.py +87 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["main"]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: ["main"]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: Python ${{ matrix.python-version }}
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
|
|
14
|
+
strategy:
|
|
15
|
+
fail-fast: false
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- name: Check out repository
|
|
21
|
+
uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Set up Python
|
|
24
|
+
uses: actions/setup-python@v5
|
|
25
|
+
with:
|
|
26
|
+
python-version: ${{ matrix.python-version }}
|
|
27
|
+
cache: pip
|
|
28
|
+
|
|
29
|
+
- name: Install package and development dependencies
|
|
30
|
+
run: |
|
|
31
|
+
python -m pip install --upgrade pip
|
|
32
|
+
python -m pip install -e ".[dev]"
|
|
33
|
+
|
|
34
|
+
- name: Lint
|
|
35
|
+
run: python -m ruff check .
|
|
36
|
+
|
|
37
|
+
- name: Type check
|
|
38
|
+
run: python -m mypy src
|
|
39
|
+
|
|
40
|
+
- name: Test
|
|
41
|
+
run: python -m pytest
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
name: Build distribution
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- name: Check out repository
|
|
15
|
+
uses: actions/checkout@v6
|
|
16
|
+
with:
|
|
17
|
+
persist-credentials: false
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v6
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.x"
|
|
23
|
+
|
|
24
|
+
- name: Install build dependencies
|
|
25
|
+
run: python -m pip install --upgrade build twine
|
|
26
|
+
|
|
27
|
+
- name: Build source and wheel distributions
|
|
28
|
+
run: python -m build
|
|
29
|
+
|
|
30
|
+
- name: Check distribution metadata
|
|
31
|
+
run: python -m twine check dist/*
|
|
32
|
+
|
|
33
|
+
- name: Store distribution packages
|
|
34
|
+
uses: actions/upload-artifact@v5
|
|
35
|
+
with:
|
|
36
|
+
name: python-package-distributions
|
|
37
|
+
path: dist/
|
|
38
|
+
|
|
39
|
+
publish:
|
|
40
|
+
name: Publish distribution to PyPI
|
|
41
|
+
needs: build
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
environment:
|
|
44
|
+
name: pypi
|
|
45
|
+
url: https://pypi.org/p/theseusplot
|
|
46
|
+
permissions:
|
|
47
|
+
id-token: write
|
|
48
|
+
|
|
49
|
+
steps:
|
|
50
|
+
- name: Download distribution packages
|
|
51
|
+
uses: actions/download-artifact@v6
|
|
52
|
+
with:
|
|
53
|
+
name: python-package-distributions
|
|
54
|
+
path: dist/
|
|
55
|
+
|
|
56
|
+
- name: Publish distribution to PyPI
|
|
57
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: theseusplot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python port of TheseusPlot for decomposing differences in rate metrics.
|
|
5
|
+
Author: TheseusPlot.py contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: matplotlib>=3.7
|
|
9
|
+
Requires-Dist: numpy>=1.24
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
13
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
14
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
15
|
+
Provides-Extra: examples
|
|
16
|
+
Requires-Dist: nycflights13>=0.0.3; extra == 'examples'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
<!-- README.md is generated from README.Rmd. Please edit that file -->
|
|
21
|
+
|
|
22
|
+
# TheseusPlot: Visualizing Decomposition of Differences in Rate Metrics
|
|
23
|
+
|
|
24
|
+
<!-- badges: start -->
|
|
25
|
+
|
|
26
|
+
[![CI](https://github.com/hoxo-m/TheseusPlot_py/actions/workflows/ci.yml/badge.svg)](https://github.com/hoxo-m/TheseusPlot_py/actions/workflows/ci.yml)
|
|
27
|
+
|
|
28
|
+
<!-- badges: end -->
|
|
29
|
+
|
|
30
|
+
## 1. Overview
|
|
31
|
+
|
|
32
|
+
In data analysis, when a metric differs between two groups, we sometimes
|
|
33
|
+
want to investigate whether a particular subgroup is driving that
|
|
34
|
+
difference. For example, when a decline in a key metric is detected compared
|
|
35
|
+
to the previous year, you may want to conduct a more detailed analysis.
|
|
36
|
+
In this analysis, you may focus on gender among the attributes and
|
|
37
|
+
examine whether the decline occurred among males, females, or both.
|
|
38
|
+
However, this type of analysis is challenging when the metric is a rate,
|
|
39
|
+
because the magnitude of each subgroup’s contribution to the rate cannot
|
|
40
|
+
be simply calculated, unlike in the case of volume metrics.
|
|
41
|
+
|
|
42
|
+
To address this issue, we propose an approach inspired by the story of
|
|
43
|
+
the *[Ship of Theseus](https://en.wikipedia.org/wiki/Ship_of_Theseus)*.
|
|
44
|
+
This approach involves gradually replacing the components of one group
|
|
45
|
+
with those of another, recalculating the metric at each step. The change
|
|
46
|
+
in the metric at each step can then be interpreted as the contribution
|
|
47
|
+
of each subgroup to the overall difference.
|
|
48
|
+
|
|
49
|
+
For instance, suppose the metric was 6.2% in 2024 and decreased to 5.2%
|
|
50
|
+
in 2025. Again, we focus on gender. We replace the male data within the
|
|
51
|
+
2024 dataset with the male data from 2025 and recalculate the metric. As
|
|
52
|
+
a result, the metric would drop by 0.8 percentage points, reaching 5.4%.
|
|
53
|
+
In this case, the contribution of the male group to the change in the
|
|
54
|
+
metric is -0.8 percentage points. Next, we replace the female data from
|
|
55
|
+
2024 with that from 2025. The dataset then consists entirely of 2025
|
|
56
|
+
data, and the metric drops by 0.2 percentage points, reaching 5.2%.
|
|
57
|
+
Thus, the contribution of the female group is -0.2 percentage points.
|
|
58
|
+
|
|
59
|
+
When visualized, the results appear as follows:
|
|
60
|
+
|
|
61
|
+
<img src="README-figures/overview-1.png" alt="" width="500" />
|
|
62
|
+
|
|
63
|
+
From this plot, we can see that the decline in the metric is primarily
|
|
64
|
+
driven by the male group. We call this visualization the “Theseus Plot.”
|
|
65
|
+
|
|
66
|
+
The **TheseusPlot** package is designed to make it easy to generate
|
|
67
|
+
Theseus Plots for various attributes.
|
|
68
|
+
|
|
69
|
+
## 2. Installation
|
|
70
|
+
|
|
71
|
+
You can install the development version from
|
|
72
|
+
[GitHub](https://github.com/hoxo-m/TheseusPlot_py) with:
|
|
73
|
+
|
|
74
|
+
``` bash
|
|
75
|
+
python -m pip install "git+https://github.com/hoxo-m/TheseusPlot_py.git"
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
You can install the optional dependencies for examples and documentation
|
|
79
|
+
data with:
|
|
80
|
+
|
|
81
|
+
``` bash
|
|
82
|
+
python -m pip install "theseusplot[examples] @ git+https://github.com/hoxo-m/TheseusPlot_py.git"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## 3. Details
|
|
86
|
+
|
|
87
|
+
### 3.1 Prepare Data
|
|
88
|
+
|
|
89
|
+
To create Theseus plots, you need two data frames that share common
|
|
90
|
+
columns.
|
|
91
|
+
|
|
92
|
+
We use the 2013 New York City flight data from
|
|
93
|
+
[nycflights13](https://cran.r-project.org/package=nycflights13) as a
|
|
94
|
+
demo dataset. Here, we will define the rate metric as the proportion of
|
|
95
|
+
flights that arrived on time. In December 2013, the on-time arrival rate
|
|
96
|
+
dropped substantially compared to November. We investigate the cause
|
|
97
|
+
using a Theseus plot.
|
|
98
|
+
|
|
99
|
+
First, we create an `on_time` column in the data frame to indicate
|
|
100
|
+
whether each flight arrived on time. Next, we extract the flights for
|
|
101
|
+
November and December into separate data frames to form two comparison
|
|
102
|
+
groups. The on-time arrival rate was 83% in November and dropped to 67%
|
|
103
|
+
in December.
|
|
104
|
+
|
|
105
|
+
``` python
|
|
106
|
+
from nycflights13 import airlines, flights
|
|
107
|
+
|
|
108
|
+
data = (
|
|
109
|
+
flights.dropna(subset=["arr_delay"])
|
|
110
|
+
.assign(on_time=lambda df: df["arr_delay"] <= 15)
|
|
111
|
+
.merge(airlines, on="carrier")
|
|
112
|
+
.assign(carrier=lambda df: df["name"])
|
|
113
|
+
.loc[
|
|
114
|
+
:,
|
|
115
|
+
[
|
|
116
|
+
"year",
|
|
117
|
+
"month",
|
|
118
|
+
"day",
|
|
119
|
+
"origin",
|
|
120
|
+
"dest",
|
|
121
|
+
"carrier",
|
|
122
|
+
"dep_delay",
|
|
123
|
+
"on_time",
|
|
124
|
+
],
|
|
125
|
+
]
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
print(data.head())
|
|
129
|
+
#> year month day origin dest carrier dep_delay on_time
|
|
130
|
+
#> 0 2013 1 1 EWR IAH United Air Lines Inc. 2.0 True
|
|
131
|
+
#> 1 2013 1 1 LGA IAH United Air Lines Inc. 4.0 False
|
|
132
|
+
#> 2 2013 1 1 JFK MIA American Airlines Inc. 2.0 False
|
|
133
|
+
#> 3 2013 1 1 JFK BQN JetBlue Airways -1.0 True
|
|
134
|
+
#> 4 2013 1 1 LGA ATL Delta Air Lines Inc. -6.0 True
|
|
135
|
+
|
|
136
|
+
data_nov = data[data["month"] == 11]
|
|
137
|
+
data_dec = data[data["month"] == 12]
|
|
138
|
+
|
|
139
|
+
print(data_nov["on_time"].mean())
|
|
140
|
+
#> 0.8264802936487339
|
|
141
|
+
print(data_dec["on_time"].mean())
|
|
142
|
+
#> 0.6738712065136936
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 3.2 Basics
|
|
146
|
+
|
|
147
|
+
Using the two prepared data frames, we first create a `ship` object. The
|
|
148
|
+
`ship` object is an instance of the Python class `ShipOfTheseus`,
|
|
149
|
+
designed to create Theseus plots.
|
|
150
|
+
|
|
151
|
+
``` python
|
|
152
|
+
from theseusplot import create_ship
|
|
153
|
+
|
|
154
|
+
ship = create_ship(
|
|
155
|
+
data_nov,
|
|
156
|
+
data_dec,
|
|
157
|
+
y="on_time",
|
|
158
|
+
labels=("November", "December"),
|
|
159
|
+
)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
You can create a Theseus plot by passing column names to the `plot`
|
|
163
|
+
method of a `ship` object. For example, to create a Theseus plot for the
|
|
164
|
+
airport of origin:
|
|
165
|
+
|
|
166
|
+
``` python
|
|
167
|
+
fig, ax = ship.plot("origin")
|
|
168
|
+
fig.show()
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
<img src="README-figures/plot_origin-3.png" alt="" width="500" />
|
|
172
|
+
|
|
173
|
+
New York City has three major airports, and Newark Liberty International
|
|
174
|
+
Airport (EWR) accounted for the largest share of the decline in the
|
|
175
|
+
on-time arrival rate.
|
|
176
|
+
|
|
177
|
+
Note that the number of flights at each airport matters, as a larger
|
|
178
|
+
flight volume is expected to have a greater impact. To make this clear,
|
|
179
|
+
the Theseus plot displays the data size for each group within each
|
|
180
|
+
subgroup as a bar chart. From this, we see that the number of flights is
|
|
181
|
+
similar across airports, allowing for direct comparison of
|
|
182
|
+
contributions.
|
|
183
|
+
|
|
184
|
+
In summary, a Theseus plot consists of two components:
|
|
185
|
+
|
|
186
|
+
- A waterfall plot showing how much each subgroup contributed to the
|
|
187
|
+
change in the metric.
|
|
188
|
+
- A bar chart representing the sample size for each group within each
|
|
189
|
+
subgroup.
|
|
190
|
+
|
|
191
|
+
A `ship` object also provides the `table` method to inspect the exact
|
|
192
|
+
values used in the Theseus plot.
|
|
193
|
+
|
|
194
|
+
``` python
|
|
195
|
+
ship.table("origin")
|
|
196
|
+
#> origin contrib n1 n2 x1 x2 rate1 rate2
|
|
197
|
+
#> 0 EWR -0.071873 9603 9410 7995 5910 0.832552 0.628055
|
|
198
|
+
#> 1 JFK -0.050249 8645 8923 7290 6142 0.843262 0.688334
|
|
199
|
+
#> 2 LGA -0.030487 8723 8687 7006 6156 0.803164 0.708645
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### 3.3 Flipping the Plot
|
|
203
|
+
|
|
204
|
+
When there are many subgroups, a Theseus plot can become hard to read.
|
|
205
|
+
In such cases, you can swap the x- and y-axes for better visualization.
|
|
206
|
+
|
|
207
|
+
``` python
|
|
208
|
+
fig, ax = ship.plot_flip("carrier")
|
|
209
|
+
fig.show()
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
<img src="README-figures/plot_carrier-5.png" alt="" width="500" />
|
|
213
|
+
|
|
214
|
+
When the number of subgroups is large, those with small contributions
|
|
215
|
+
are automatically grouped together. By default, this happens when there
|
|
216
|
+
are more than 10 subgroups, but the threshold can be adjusted with the
|
|
217
|
+
`n` argument.
|
|
218
|
+
|
|
219
|
+
``` python
|
|
220
|
+
fig, ax = ship.plot_flip("carrier", n=6)
|
|
221
|
+
fig.show()
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
<img src="README-figures/plot_carrier_n-7.png" alt="" width="500" />
|
|
225
|
+
|
|
226
|
+
From this plot, JetBlue Airways and United Air Lines appear to have the
|
|
227
|
+
largest contributions to the decline in on-time arrival rate.
|
|
228
|
+
|
|
229
|
+
### 3.4 Automatic Discretization of Continuous Values
|
|
230
|
+
|
|
231
|
+
Theseus plots do not directly support continuous variables. If a
|
|
232
|
+
continuous column is provided, it is automatically discretized. For
|
|
233
|
+
example, we can create a Theseus plot for departure delays.
|
|
234
|
+
|
|
235
|
+
``` python
|
|
236
|
+
fig, ax = ship.plot_flip("dep_delay")
|
|
237
|
+
fig.show()
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
<img src="README-figures/plot_dep_delay-9.png" alt="" width="500" />
|
|
241
|
+
|
|
242
|
+
By default, continuous variables are discretized so that each subgroup
|
|
243
|
+
has roughly equal sample sizes, with the number of bins set to 10. You
|
|
244
|
+
can modify these settings by passing the return value of
|
|
245
|
+
`continuous_config()` to the `continuous` argument.
|
|
246
|
+
|
|
247
|
+
``` python
|
|
248
|
+
from theseusplot import continuous_config
|
|
249
|
+
|
|
250
|
+
fig, ax = ship.plot_flip("dep_delay", continuous=continuous_config(n=3))
|
|
251
|
+
fig.show()
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
<img src="README-figures/plot_dep_delay_n-11.png" alt="" width="500" />
|
|
255
|
+
|
|
256
|
+
This result shows that both a decrease in on-time departures and an
|
|
257
|
+
increase in delayed departures contributed to the decline in on-time
|
|
258
|
+
arrival rate.
|
|
259
|
+
|
|
260
|
+
### 3.5 Ordering for Factor Columns
|
|
261
|
+
|
|
262
|
+
If a subgroup column is categorical, `table()` and `plot()` respect its
|
|
263
|
+
category order. This is useful when you want to keep a meaningful
|
|
264
|
+
predefined order, such as `"Low"`, `"Medium"`, and `"High"`, instead of
|
|
265
|
+
ordering categories by their contributions.
|
|
266
|
+
|
|
267
|
+
``` python
|
|
268
|
+
import pandas as pd
|
|
269
|
+
from pandas.api.types import CategoricalDtype
|
|
270
|
+
|
|
271
|
+
segment_type = CategoricalDtype(
|
|
272
|
+
categories=["Low", "Medium", "High"],
|
|
273
|
+
ordered=True,
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
data1 = pd.DataFrame(
|
|
277
|
+
{
|
|
278
|
+
"segment": pd.Series(
|
|
279
|
+
["Low", "Low", "Medium", "Medium", "High", "High"],
|
|
280
|
+
dtype=segment_type,
|
|
281
|
+
),
|
|
282
|
+
"y": [1, 1, 1, 0, 1, 1],
|
|
283
|
+
}
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
data2 = pd.DataFrame(
|
|
287
|
+
{
|
|
288
|
+
"segment": pd.Series(
|
|
289
|
+
["Low", "Low", "Medium", "Medium", "High", "High"],
|
|
290
|
+
dtype=segment_type,
|
|
291
|
+
),
|
|
292
|
+
"y": [1, 0, 1, 1, 0, 0],
|
|
293
|
+
}
|
|
294
|
+
)
|
|
295
|
+
|
|
296
|
+
ship = create_ship(data1, data2, y="y", labels=("Group 1", "Group 2"))
|
|
297
|
+
|
|
298
|
+
print(ship.table("segment"))
|
|
299
|
+
#> segment contrib n1 n2 x1 x2 rate1 rate2
|
|
300
|
+
#> 0 Low -0.166667 2 2 2 1 1.0 0.5
|
|
301
|
+
#> 1 Medium 0.166667 2 2 1 2 0.5 1.0
|
|
302
|
+
#> 2 High -0.333333 2 2 2 0 1.0 0.0
|
|
303
|
+
|
|
304
|
+
fig, ax = ship.plot("segment")
|
|
305
|
+
fig.show()
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
<img src="README-figures/factor_column-13.png" alt="" width="500" />
|
|
309
|
+
|
|
310
|
+
Even if the contribution of `"High"` is larger than that of `"Low"` or
|
|
311
|
+
`"Medium"`, the rows and bars are shown in the order
|
|
312
|
+
`"Low" -> "Medium" -> "High"` because `segment` is categorical.
|
|
313
|
+
|
|
314
|
+
By contrast, if `segment` were an object column, the output would be
|
|
315
|
+
ordered by contribution rather than by a predefined level order.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|