sequential-importance-sampling 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sequential_importance_sampling-0.1.0/LICENSE +21 -0
- sequential_importance_sampling-0.1.0/PKG-INFO +92 -0
- sequential_importance_sampling-0.1.0/README.md +74 -0
- sequential_importance_sampling-0.1.0/pyproject.toml +36 -0
- sequential_importance_sampling-0.1.0/setup.cfg +4 -0
- sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/PKG-INFO +92 -0
- sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/SOURCES.txt +11 -0
- sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/dependency_links.txt +1 -0
- sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/requires.txt +5 -0
- sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/top_level.txt +1 -0
- sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.py +204 -0
- sequential_importance_sampling-0.1.0/tests/test_diaconis_gangolli.py +45 -0
- sequential_importance_sampling-0.1.0/tests/test_holmes_jones.py +50 -0
sequential_importance_sampling-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 gfrt0

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
sequential_importance_sampling-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,92 @@
Metadata-Version: 2.4
Name: sequential-importance-sampling
Version: 0.1.0
Summary: Sequential importance sampling for contingency tables with fixed margins
Author: gfrt0
License-Expression: MIT
Project-URL: Repository, https://github.com/gfrt0/sequential-importance-sampling
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Scientific/Engineering :: Mathematics
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy
Requires-Dist: numba
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Dynamic: license-file

# Sequential Importance Sampling for Contingency Tables

Numba-accelerated sequential importance sampling (SIS) for **two-way** contingency tables with fixed row and column margins, producing importance-weighted samples that target the uniform distribution, following **Chen, Diaconis, Holmes, and Liu (2005, JASA)**.

## Installation

```bash
pip install -e .
```

## Usage

```python
import numpy as np
from sequential_importance_sampling import sample_tables

# Row and column margins from Chen et al. (2005)
row_sums = np.array([10, 62, 13, 11, 39])
col_sums = np.array([65, 25, 45])

# Draw 150,000 importance-weighted tables
tables, logq = sample_tables(row_sums, col_sums, num_samples=150_000, rng_seed=44042)

# Estimate the number of tables with these margins
# True value: 239,382,173 (Diaconis and Gangolli, 1995)
print("Estimate:", np.exp(-logq).mean())
```

**Batch dimensions** are supported for sampling many independent two-way tables in parallel. If `row_sums` has shape `(R, d1, d2, ...)` and `col_sums` has shape `(C, d1, d2, ...)`, each combination of batch indices defines an independent R×C table problem. The output `tables` will have shape `(num_samples, d1, d2, ..., R, C)` with batch dims before table dims for C-contiguous access. Note that this does **not** extend to multi-way (3+) contingency tables with additional margin constraints — each batch element is a separate two-way table.

## API

| Function | Description |
|---|---|
| `sample_tables` | Main entry point — sample batches of tables with arbitrary batch dimensions |
| `sample_table_sis` | Sample a single contingency table via column-wise SIS |

`sample_tables` returns `(tables, logq)` where `logq` contains the log importance weights. Pass `parallel=False` to disable numba parallelism.

## Tests

Run the Diaconis-Gangolli counting verification:

```bash
python tests/test_diaconis_gangolli.py
```

```
True count: 239,382,173
SIS estimate (n=150k): 239,413,201
Coefficient of variation: 0.9512
Effective sample size: 78,750
```

Run the Holmes-Jones example from the Chen et al. paper:

```bash
python tests/test_holmes_jones.py
```

```
Reference (Chen et al.): 3.383e16
SIS estimate (n=1M): 3.382e+16
log10 estimate: 16.5291
log10 reference: 16.5293
Coefficient of variation: 1.0537
Effective sample size: 473,875
```


## References

- Chen, Y., Diaconis, P., Holmes, S. P., & Liu, J. S. (2005). Sequential Monte Carlo methods for statistical analysis of tables. *Journal of the American Statistical Association*, 100(469), 109-120.
- Diaconis, P., & Gangolli, A. (1995). Rectangular arrays with fixed margins. In *Discrete Probability and Algorithms* (pp. 15-41). Springer.
sequential_importance_sampling-0.1.0/README.md
ADDED
@@ -0,0 +1,74 @@
# Sequential Importance Sampling for Contingency Tables

Numba-accelerated sequential importance sampling (SIS) for **two-way** contingency tables with fixed row and column margins, producing importance-weighted samples that target the uniform distribution, following **Chen, Diaconis, Holmes, and Liu (2005, JASA)**.

## Installation

```bash
pip install -e .
```

## Usage

```python
import numpy as np
from sequential_importance_sampling import sample_tables

# Row and column margins from Chen et al. (2005)
row_sums = np.array([10, 62, 13, 11, 39])
col_sums = np.array([65, 25, 45])

# Draw 150,000 importance-weighted tables
tables, logq = sample_tables(row_sums, col_sums, num_samples=150_000, rng_seed=44042)

# Estimate the number of tables with these margins
# True value: 239,382,173 (Diaconis and Gangolli, 1995)
print("Estimate:", np.exp(-logq).mean())
```
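
The log importance weights `logq` also yield the standard weight diagnostics reported by the tests below. A minimal sketch, assuming only `numpy` and the `logq` array from the example above:

```python
# Weight diagnostics: coefficient of variation of w = exp(-logq), and the
# common effective-sample-size rule of thumb ESS = n / (1 + cv^2).
w = np.exp(-logq - (-logq).max())  # rescaled for stability; cv is scale-invariant
cv = w.std() / w.mean()
ess = logq.size / (1 + cv**2)
print(f"cv = {cv:.4f}, ESS = {ess:,.0f}")
```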

**Batch dimensions** are supported for sampling many independent two-way tables in parallel. If `row_sums` has shape `(R, d1, d2, ...)` and `col_sums` has shape `(C, d1, d2, ...)`, each combination of batch indices defines an independent R×C table problem. The output `tables` will have shape `(num_samples, d1, d2, ..., R, C)` with batch dims before table dims for C-contiguous access. Note that this does **not** extend to multi-way (3+) contingency tables with additional margin constraints — each batch element is a separate two-way table.
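
A minimal sketch of the batched call under the shape convention just described (the margins and sizes here are illustrative):

```python
import numpy as np
from sequential_importance_sampling import sample_tables

# Two independent 2x3 table problems stacked along one batch dimension (d1=2).
row_sums = np.array([[4, 6],        # shape (R=2, d1=2)
                     [6, 4]])
col_sums = np.array([[2, 2],        # shape (C=3, d1=2)
                     [3, 3],
                     [5, 5]])

tables, logq = sample_tables(row_sums, col_sums, num_samples=100, rng_seed=1)
print(tables.shape)  # (100, 2, 2, 3): (num_samples, d1, R, C)
print(logq.shape)    # (100, 2)
```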

## API

| Function | Description |
|---|---|
| `sample_tables` | Main entry point — sample batches of tables with arbitrary batch dimensions |
| `sample_table_sis` | Sample a single contingency table via column-wise SIS |

`sample_tables` returns `(tables, logq)` where `logq` contains the log importance weights. Pass `parallel=False` to disable numba parallelism.
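
Seeding is derived per draw from `rng_seed` inside the jitted kernels, so a fixed seed should give identical output whether or not parallelism is enabled; a quick illustrative check, reusing the margins from the usage example:

```python
t1, q1 = sample_tables(row_sums, col_sums, num_samples=100, rng_seed=7)
t2, q2 = sample_tables(row_sums, col_sums, num_samples=100, rng_seed=7, parallel=False)
assert (t1 == t2).all() and (q1 == q2).all()  # same seed, same draws
```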

## Tests

Run the Diaconis-Gangolli counting verification:

```bash
python tests/test_diaconis_gangolli.py
```

```
True count: 239,382,173
SIS estimate (n=150k): 239,413,201
Coefficient of variation: 0.9512
Effective sample size: 78,750
```

Run the Holmes-Jones example from the Chen et al. paper:

```bash
python tests/test_holmes_jones.py
```

```
Reference (Chen et al.): 3.383e16
SIS estimate (n=1M): 3.382e+16
log10 estimate: 16.5291
log10 reference: 16.5293
Coefficient of variation: 1.0537
Effective sample size: 473,875
```


## References

- Chen, Y., Diaconis, P., Holmes, S. P., & Liu, J. S. (2005). Sequential Monte Carlo methods for statistical analysis of tables. *Journal of the American Statistical Association*, 100(469), 109-120.
- Diaconis, P., & Gangolli, A. (1995). Rectangular arrays with fixed margins. In *Discrete Probability and Algorithms* (pp. 15-41). Springer.
sequential_importance_sampling-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,36 @@
[build-system]
requires = ["setuptools>=64"]
build-backend = "setuptools.build_meta"

[project]
name = "sequential-importance-sampling"
version = "0.1.0"
description = "Sequential importance sampling for contingency tables with fixed margins"
readme = "README.md"
license = "MIT"
requires-python = ">=3.9"
authors = [
    { name = "gfrt0" },
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Mathematics",
]
dependencies = [
    "numpy",
    "numba",
]

[project.optional-dependencies]
dev = [
    "pytest",
]

[project.urls]
Repository = "https://github.com/gfrt0/sequential-importance-sampling"

[tool.setuptools]
py-modules = ["sequential_importance_sampling"]

[tool.setuptools.package-dir]
"" = "src"
sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,92 @@
Metadata-Version: 2.4
Name: sequential-importance-sampling
Version: 0.1.0
Summary: Sequential importance sampling for contingency tables with fixed margins
Author: gfrt0
License-Expression: MIT
Project-URL: Repository, https://github.com/gfrt0/sequential-importance-sampling
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Scientific/Engineering :: Mathematics
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy
Requires-Dist: numba
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Dynamic: license-file

# Sequential Importance Sampling for Contingency Tables

Numba-accelerated sequential importance sampling (SIS) for **two-way** contingency tables with fixed row and column margins, producing importance-weighted samples that target the uniform distribution, following **Chen, Diaconis, Holmes, and Liu (2005, JASA)**.

## Installation

```bash
pip install -e .
```

## Usage

```python
import numpy as np
from sequential_importance_sampling import sample_tables

# Row and column margins from Chen et al. (2005)
row_sums = np.array([10, 62, 13, 11, 39])
col_sums = np.array([65, 25, 45])

# Draw 150,000 importance-weighted tables
tables, logq = sample_tables(row_sums, col_sums, num_samples=150_000, rng_seed=44042)

# Estimate the number of tables with these margins
# True value: 239,382,173 (Diaconis and Gangolli, 1995)
print("Estimate:", np.exp(-logq).mean())
```

**Batch dimensions** are supported for sampling many independent two-way tables in parallel. If `row_sums` has shape `(R, d1, d2, ...)` and `col_sums` has shape `(C, d1, d2, ...)`, each combination of batch indices defines an independent R×C table problem. The output `tables` will have shape `(num_samples, d1, d2, ..., R, C)` with batch dims before table dims for C-contiguous access. Note that this does **not** extend to multi-way (3+) contingency tables with additional margin constraints — each batch element is a separate two-way table.

## API

| Function | Description |
|---|---|
| `sample_tables` | Main entry point — sample batches of tables with arbitrary batch dimensions |
| `sample_table_sis` | Sample a single contingency table via column-wise SIS |

`sample_tables` returns `(tables, logq)` where `logq` contains the log importance weights. Pass `parallel=False` to disable numba parallelism.

## Tests

Run the Diaconis-Gangolli counting verification:

```bash
python tests/test_diaconis_gangolli.py
```

```
True count: 239,382,173
SIS estimate (n=150k): 239,413,201
Coefficient of variation: 0.9512
Effective sample size: 78,750
```

Run the Holmes-Jones example from the Chen et al. paper:

```bash
python tests/test_holmes_jones.py
```

```
Reference (Chen et al.): 3.383e16
SIS estimate (n=1M): 3.382e+16
log10 estimate: 16.5291
log10 reference: 16.5293
Coefficient of variation: 1.0537
Effective sample size: 473,875
```


## References

- Chen, Y., Diaconis, P., Holmes, S. P., & Liu, J. S. (2005). Sequential Monte Carlo methods for statistical analysis of tables. *Journal of the American Statistical Association*, 100(469), 109-120.
- Diaconis, P., & Gangolli, A. (1995). Rectangular arrays with fixed margins. In *Discrete Probability and Algorithms* (pp. 15-41). Springer.
sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,11 @@
LICENSE
README.md
pyproject.toml
src/sequential_importance_sampling.py
src/sequential_importance_sampling.egg-info/PKG-INFO
src/sequential_importance_sampling.egg-info/SOURCES.txt
src/sequential_importance_sampling.egg-info/dependency_links.txt
src/sequential_importance_sampling.egg-info/requires.txt
src/sequential_importance_sampling.egg-info/top_level.txt
tests/test_diaconis_gangolli.py
tests/test_holmes_jones.py
sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@

sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
sequential_importance_sampling
sequential_importance_sampling-0.1.0/src/sequential_importance_sampling.py
ADDED
@@ -0,0 +1,204 @@
import math

import numba
import numpy as np


@numba.njit
def count_bounded_compositions(s, bounds, K):
    """
    Counts the number of integer vectors x with sum s and x[k] <= bounds[k].
    Uses a 2-loop DP approach with complexity O(K*s).
    """
    dp = np.zeros((K + 1, s + 1), dtype=np.int64)
    dp[0, 0] = 1

    for k in range(1, K + 1):
        bound_k = bounds[k - 1]
        cumsum_prev_row = np.cumsum(dp[k - 1, :])
        for j in range(s + 1):
            upper_sum = cumsum_prev_row[j]
            lower_sum_idx = j - bound_k - 1
            lower_sum = cumsum_prev_row[lower_sum_idx] if lower_sum_idx >= 0 else 0
            dp[k, j] = upper_sum - lower_sum
    return dp[K, s]


@numba.njit
def _precompute_suffix_counts(column_sum, row_sums, K):
    """
    Precompute suffix_counts[k, s] = number of compositions of s into
    (x[k], ..., x[K-1]) with x[i] <= row_sums[i].

    Built bottom-up from k = K-1 down to k = 0. O(K * column_sum).
    """
    S = column_sum
    suffix = np.zeros((K + 1, S + 1), dtype=np.int64)
    suffix[K, 0] = 1

    for k in range(K - 1, -1, -1):
        bound_k = row_sums[k]
        cumsum_next = np.cumsum(suffix[k + 1, :])
        for s in range(S + 1):
            upper = cumsum_next[s]
            lower_idx = s - bound_k - 1
            lower = cumsum_next[lower_idx] if lower_idx >= 0 else 0
            suffix[k, s] = upper - lower

    return suffix


@numba.njit
def sample_uniform_column(column_sum, row_sums, K):
    """
    Uniformly sample an integer vector x with sum column_sum and x[i] <= row_sums[i],
    using suffix DP precomputation for O(K * column_sum) total work.

    Returns (x, total_count) where total_count = number of valid compositions.
    """
    suffix = _precompute_suffix_counts(column_sum, row_sums, K)
    total_count = suffix[0, column_sum]

    output = np.zeros(K, dtype=np.int64)
    remaining = column_sum

    for k in range(K - 1):
        max_val = min(row_sums[k], remaining)

        # Sample from unnormalized weights via linear scan.
        # weight(val) = suffix[k+1, remaining - val]; sum = suffix[k, remaining].
        target = np.random.random() * float(suffix[k, remaining])
        cumulative = 0.0
        chosen = max_val
        for val in range(max_val + 1):
            cumulative += float(suffix[k + 1, remaining - val])
            if cumulative > target:
                chosen = val
                break

        output[k] = chosen
        remaining -= chosen

    output[K - 1] = remaining
    return output, total_count


@numba.njit
def sample_table_sis(row_sums, col_sums, K, C):
    """
    Sample a single K x C contingency table with fixed margins by column-wise
    SIS, as in Chen, Diaconis, Holmes, and Liu (2005, JASA).

    Returns (table, logq) where logq is the log-probability of the table
    under the sequential proposal.
    """
    remaining = row_sums.copy()
    output = np.zeros((K, C), dtype=np.int64)
    log_q = 0.0

    for c in range(C - 1):
        col, total_count = sample_uniform_column(col_sums[c], remaining, K)
        output[:, c] = col
        log_q += math.log(total_count)
        remaining -= col

    output[:, C - 1] = remaining
    return output, -log_q


@numba.njit
def _sample_tables_core(row_sums, col_sums, num_samples, rng_seed):
    """
    Sequential inner loop over (num_samples, n_batch).

    row_sums: (n_batch, R) — each row is one batch element's row margins
    col_sums: (n_batch, C) — each row is one batch element's column margins
    """
    n_batch = row_sums.shape[0]
    R = row_sums.shape[1]
    C = col_sums.shape[1]

    tables = np.zeros((num_samples, n_batch, R, C), dtype=np.int64)
    logq = np.zeros((num_samples, n_batch))

    for s in range(num_samples):
        for b in range(n_batch):
            if rng_seed is not None:
                np.random.seed(rng_seed + s * 104729 + b * 131)
            table, lnq = sample_table_sis(row_sums[b].copy(), col_sums[b].copy(), R, C)
            tables[s, b] = table
            logq[s, b] = lnq

    return tables, logq


@numba.njit(parallel=True)
def _sample_tables_core_parallel(row_sums, col_sums, num_samples, rng_seed):
    """
    Parallel inner loop: prange over num_samples, sequential over n_batch.

    row_sums: (n_batch, R) — each row is one batch element's row margins
    col_sums: (n_batch, C) — each row is one batch element's column margins
    """
    n_batch = row_sums.shape[0]
    R = row_sums.shape[1]
    C = col_sums.shape[1]

    tables = np.zeros((num_samples, n_batch, R, C), dtype=np.int64)
    logq = np.zeros((num_samples, n_batch))

    for s in numba.prange(num_samples):
        for b in range(n_batch):
            if rng_seed is not None:
                np.random.seed(rng_seed + s * 104729 + b * 131)
            table, lnq = sample_table_sis(row_sums[b].copy(), col_sums[b].copy(), R, C)
            tables[s, b] = table
            logq[s, b] = lnq

    return tables, logq


def sample_tables(row_sums, col_sums, num_samples, rng_seed=None, parallel=True):
    """
    Sample contingency tables with fixed margins via sequential importance
    sampling, following Chen, Diaconis, Holmes, and Liu (2005, JASA).

    Inputs:
      row_sums : array of shape (R, *batch) — row margins
      col_sums : array of shape (C, *batch) — column margins
        Batch dimensions (if any) must match between row_sums and col_sums.
      num_samples : int — number of tables to draw
      rng_seed : optional int — seed for reproducibility
      parallel : bool — use numba parallel sampling (default True)

    Outputs:
      tables : int64 array of shape (num_samples, *batch, R, C)
      logq : float64 array of shape (num_samples, *batch)
        Log importance weights. To estimate the number of tables with
        the given margins, compute np.exp(-logq).mean().
    """
    row_sums = np.asarray(row_sums, dtype=np.int64)
    col_sums = np.asarray(col_sums, dtype=np.int64)

    R = row_sums.shape[0]
    C = col_sums.shape[0]
    batch_shape = row_sums.shape[1:]

    assert col_sums.shape[1:] == batch_shape, (
        f"Batch shapes must match: row_sums {row_sums.shape[1:]} vs col_sums {col_sums.shape[1:]}"
    )

    n_batch = int(np.prod(batch_shape)) if len(batch_shape) > 0 else 1

    # Flatten batch dims and transpose so each batch element's margins are a
    # contiguous row: (R, n_batch) -> (n_batch, R), same for col_sums.
    row_flat = np.ascontiguousarray(row_sums.reshape(R, n_batch).T)
    col_flat = np.ascontiguousarray(col_sums.reshape(C, n_batch).T)

    if parallel:
        tables_flat, logq_flat = _sample_tables_core_parallel(
            row_flat, col_flat, num_samples, rng_seed
        )
    else:
        tables_flat, logq_flat = _sample_tables_core(
            row_flat, col_flat, num_samples, rng_seed
        )

    tables = tables_flat.reshape((num_samples,) + batch_shape + (R, C))
    logq = logq_flat.reshape((num_samples,) + batch_shape)

    return tables, logq
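As a usage note on the kernel above: `sample_table_sis` can be exercised directly once the margins are int64 arrays. A minimal sketch (hypothetical, not shipped with the package):

```python
import numpy as np
from sequential_importance_sampling import sample_table_sis

row_sums = np.array([10, 62, 13, 11, 39], dtype=np.int64)
col_sums = np.array([65, 25, 45], dtype=np.int64)

# First call triggers numba compilation. Note that np.random.seed() called
# from plain Python does not seed numba's internal RNG; for reproducible
# draws, use sample_tables(..., rng_seed=...), which seeds inside jitted code.
table, logq = sample_table_sis(row_sums, col_sums, len(row_sums), len(col_sums))

# The sampled table always satisfies both margin constraints exactly.
assert (table.sum(axis=1) == row_sums).all()
assert (table.sum(axis=0) == col_sums).all()
```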
sequential_importance_sampling-0.1.0/tests/test_diaconis_gangolli.py
ADDED
@@ -0,0 +1,45 @@
"""
Reproduce the counting result from Diaconis and Gangolli (1995):

The number of 5x3 contingency tables of non-negative integers with
row sums [10, 62, 13, 11, 39] and column sums [65, 25, 45] is
exactly 239,382,173.

We estimate this via sequential importance sampling following
Chen, Diaconis, Holmes, and Liu (2005, JASA).
"""

import numpy as np
from sequential_importance_sampling import sample_tables


def test_diaconis_gangolli():
    true_count = 239_382_173

    row_sums = np.array([10, 62, 13, 11, 39])
    col_sums = np.array([65, 25, 45])

    _, logq = sample_tables(
        row_sums, col_sums,
        num_samples=150_000,
        rng_seed=44042,
    )

    inv_w = np.exp(-logq)
    estimate = inv_w.mean()
    cv = inv_w.std() / inv_w.mean()
    ess = 150_000 / (1 + cv ** 2)

    print(f"True count: {true_count:,}")
    print(f"SIS estimate (n=150k): {estimate:,.0f}")
    print(f"Coefficient of variation: {cv:.4f}")
    print(f"Effective sample size: {ess:,.0f}")

    # Allow 1% relative error
    assert abs(estimate - true_count) / true_count < 0.01, (
        f"Estimate {estimate:,.0f} too far from true count {true_count:,}"
    )


if __name__ == "__main__":
    test_diaconis_gangolli()
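For very small margins the count can also be cross-checked by exhaustive enumeration. A brute-force sketch (illustrative only, not part of the test suite; it is exponential in table size, so keep margins tiny):

```python
import itertools
import numpy as np
from sequential_importance_sampling import sample_tables

def compositions(total, parts):
    """All non-negative integer vectors of length `parts` summing to `total`."""
    if parts == 1:
        yield (total,)
        return
    for first in range(total + 1):
        for rest in compositions(total - first, parts - 1):
            yield (first,) + rest

def count_tables_bruteforce(row_sums, col_sums):
    """Count tables by enumerating each row as a composition of its row sum."""
    C = len(col_sums)
    return sum(
        1
        for rows in itertools.product(*(compositions(r, C) for r in row_sums))
        if all(sum(col) == cs for col, cs in zip(zip(*rows), col_sums))
    )

row_sums = np.array([3, 4, 2])
col_sums = np.array([4, 3, 2])
exact = count_tables_bruteforce(row_sums.tolist(), col_sums.tolist())
_, logq = sample_tables(row_sums, col_sums, num_samples=50_000, rng_seed=0)
print(exact, np.exp(-logq).mean())  # the SIS estimate should land close to exact
```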
sequential_importance_sampling-0.1.0/tests/test_holmes_jones.py
ADDED
@@ -0,0 +1,50 @@
"""
Reproduce the counting result from Holmes and Jones (1996):

A 5x4 contingency table with row sums [9, 49, 182, 478, 551]
and column sums [9, 309, 355, 596].

Chen et al. (2005, JASA) estimate the total number of tables
as approximately 3.383 x 10^16 via SIS (Section 6.4).
"""

import numpy as np
from sequential_importance_sampling import sample_tables


def test_holmes_jones():
    # Chen et al. (2005) SIS estimate: ~3.383 x 10^16
    log10_reference = 16 + np.log10(3.383)

    num_samples = 1_000_000

    row_sums = np.array([9, 49, 182, 478, 551])
    col_sums = np.array([9, 309, 355, 596])

    _, logq = sample_tables(
        row_sums, col_sums,
        num_samples=num_samples,
        rng_seed=44042,
    )

    inv_w = np.exp(-logq)
    estimate = inv_w.mean()
    log10_estimate = np.log10(estimate)
    cv = inv_w.std() / inv_w.mean()
    ess = num_samples / (1 + cv ** 2)

    print("Reference (Chen et al.): 3.383e16")
    print(f"SIS estimate (n=1M): {estimate:.3e}")
    print(f"log10 estimate: {log10_estimate:.4f}")
    print(f"log10 reference: {log10_reference:.4f}")
    print(f"Coefficient of variation: {cv:.4f}")
    print(f"Effective sample size: {ess:,.0f}")

    # Allow 1% relative error on log10 scale
    assert abs(log10_estimate - log10_reference) / log10_reference < 0.01, (
        f"log10 estimate {log10_estimate:.4f} too far from reference {log10_reference:.4f}"
    )


if __name__ == "__main__":
    test_holmes_jones()
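One caveat worth noting: both tests exponentiate `-logq` directly, which is fine at these magnitudes but can overflow float64 for much larger tables. A log-scale variant of the same estimator, a minimal sketch assuming only numpy:

```python
import numpy as np

def log10_count_estimate(logq):
    """log10 of mean(exp(-logq)), computed via a log-sum-exp shift to avoid overflow."""
    a = -np.asarray(logq)
    m = a.max()                                   # factor out the largest weight
    return (m + np.log(np.mean(np.exp(a - m)))) / np.log(10.0)
```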