pragmastat 3.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pragmastat has been flagged as potentially problematic; see the registry's advisory page for more details.

@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Andrey Akinshin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,3 @@
1
+ include src/*.c
2
+ include README.md
3
+ include LICENSE
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: pragmastat
3
+ Version: 3.1.6
4
+ Summary: Pragmastat: Pragmatic Statistical Toolkit
5
+ Author: Andrey Akinshin
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://pragmastat.dev
8
+ Project-URL: Repository, https://github.com/AndreyAkinshin/pragmastat
9
+ Project-URL: DOI, https://doi.org/10.5281/zenodo.17236778
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=1.20
14
+ Dynamic: license-file
15
+
16
+ # Pragmastat Python Implementation
17
+
18
+ [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.17236778.svg)](https://doi.org/10.5281/zenodo.17236778)
19
+
20
+ A Python implementation of the Pragmastat statistical toolkit, providing robust statistical estimators for reliable analysis of real-world data.
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install pragmastat
26
+ ```
27
+
28
+ ## Requirements
29
+
30
+ - Python >= 3.8
31
+ - NumPy >= 1.20
32
+
33
+ ## Usage
34
+
35
+ ```python
36
+ from pragmastat import center, spread, rel_spread, shift, ratio, avg_spread, disparity
37
+
38
+ # Basic estimators
39
+ x = [1, 2, 3, 4, 5]
40
+ print(f"Center: {center(x)}")
41
+ print(f"Spread: {spread(x)}")
42
+ print(f"RelSpread: {rel_spread(x)}")
43
+
44
+ # Comparison estimators
45
+ y = [3, 4, 5, 6, 7]
46
+ print(f"Shift: {shift(x, y)}")
47
+ print(f"Ratio: {ratio(x, y)}")
48
+ print(f"AvgSpread: {avg_spread(x, y)}")
49
+ print(f"Disparity: {disparity(x, y)}")
50
+ ```
51
+
52
+ ## Estimators
53
+
54
+ ### Single-sample estimators
55
+
56
+ - `center(x)`: Hodges-Lehmann estimator - median of all pairwise averages
57
+ - `spread(x)`: Shamos estimator - median of all pairwise absolute differences
58
+ - `rel_spread(x)`: Relative spread - spread divided by absolute center
59
+
60
+ ### Two-sample estimators
61
+
62
+ - `shift(x, y)`: Hodges-Lehmann shift estimator - median of all pairwise differences
63
+ - `ratio(x, y)`: Median of all pairwise ratios
64
+ - `avg_spread(x, y)`: Weighted average of spreads
65
+ - `disparity(x, y)`: Normalized shift - shift divided by average spread
66
+
67
+ ## Features
68
+
69
+ - Robust to outliers
70
+ - Supports both Python lists and NumPy arrays
71
+ - Type hints with numpy.typing
72
+ - Efficient vectorized NumPy operations
73
+
74
+ ## License
75
+
76
+ MIT
@@ -0,0 +1,61 @@
1
+ # Pragmastat Python Implementation
2
+
3
+ [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.17236778.svg)](https://doi.org/10.5281/zenodo.17236778)
4
+
5
+ A Python implementation of the Pragmastat statistical toolkit, providing robust statistical estimators for reliable analysis of real-world data.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install pragmastat
11
+ ```
12
+
13
+ ## Requirements
14
+
15
+ - Python >= 3.8
16
+ - NumPy >= 1.20
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+ from pragmastat import center, spread, rel_spread, shift, ratio, avg_spread, disparity
22
+
23
+ # Basic estimators
24
+ x = [1, 2, 3, 4, 5]
25
+ print(f"Center: {center(x)}")
26
+ print(f"Spread: {spread(x)}")
27
+ print(f"RelSpread: {rel_spread(x)}")
28
+
29
+ # Comparison estimators
30
+ y = [3, 4, 5, 6, 7]
31
+ print(f"Shift: {shift(x, y)}")
32
+ print(f"Ratio: {ratio(x, y)}")
33
+ print(f"AvgSpread: {avg_spread(x, y)}")
34
+ print(f"Disparity: {disparity(x, y)}")
35
+ ```
36
+
37
+ ## Estimators
38
+
39
+ ### Single-sample estimators
40
+
41
+ - `center(x)`: Hodges-Lehmann estimator - median of all pairwise averages
42
+ - `spread(x)`: Shamos estimator - median of all pairwise absolute differences
43
+ - `rel_spread(x)`: Relative spread - spread divided by absolute center
44
+
45
+ ### Two-sample estimators
46
+
47
+ - `shift(x, y)`: Hodges-Lehmann shift estimator - median of all pairwise differences
48
+ - `ratio(x, y)`: Median of all pairwise ratios
49
+ - `avg_spread(x, y)`: Weighted average of spreads
50
+ - `disparity(x, y)`: Normalized shift - shift divided by average spread
51
+
52
+ ## Features
53
+
54
+ - Robust to outliers
55
+ - Supports both Python lists and NumPy arrays
56
+ - Type hints with numpy.typing
57
+ - Efficient vectorized NumPy operations
58
+
59
+ ## License
60
+
61
+ MIT
@@ -0,0 +1,51 @@
1
+ from pragmastat import center, spread, rel_spread, shift, ratio, avg_spread, disparity
2
+
3
+
4
+ def main():
5
+ x = [0, 2, 4, 6, 8]
6
+ print(center(x)) # 4
7
+ print(center([v + 10 for v in x])) # 14
8
+ print(center([v * 3 for v in x])) # 12
9
+
10
+ print(spread(x)) # 4
11
+ print(spread([v + 10 for v in x])) # 4
12
+ print(spread([v * 2 for v in x])) # 8
13
+
14
+ print(rel_spread(x)) # 1
15
+ print(rel_spread([v * 5 for v in x])) # 1
16
+
17
+ y = [10, 12, 14, 16, 18]
18
+ print(shift(x, y)) # -10
19
+ print(shift(x, x)) # 0
20
+ print(shift([v + 7 for v in x], [v + 3 for v in y])) # -6
21
+ print(shift([v * 2 for v in x], [v * 2 for v in y])) # -20
22
+ print(shift(y, x)) # 10
23
+
24
+ x = [1, 2, 4, 8, 16]
25
+ y = [2, 4, 8, 16, 32]
26
+ print(ratio(x, y)) # 0.5
27
+ print(ratio(x, x)) # 1
28
+ print(ratio([v * 2 for v in x], [v * 5 for v in y])) # 0.2
29
+
30
+ x = [0, 3, 6, 9, 12]
31
+ y = [0, 2, 4, 6, 8]
32
+ print(spread(x)) # 6
33
+ print(spread(y)) # 4
34
+
35
+ print(avg_spread(x, y)) # 5
36
+ print(avg_spread(x, x)) # 6
37
+ print(avg_spread([v * 2 for v in x], [v * 3 for v in x])) # 15
38
+ print(avg_spread(y, x)) # 5
39
+ print(avg_spread([v * 2 for v in x], [v * 2 for v in y])) # 10
40
+
41
+ print(shift(x, y)) # 2
42
+ print(avg_spread(x, y)) # 5
43
+
44
+ print(disparity(x, y)) # 0.4
45
+ print(disparity([v + 5 for v in x], [v + 5 for v in y])) # 0.4
46
+ print(disparity([v * 2 for v in x], [v * 2 for v in y])) # 0.4
47
+ print(disparity(y, x)) # -0.4
48
+
49
+
50
+ if __name__ == "__main__":
51
+ main()
@@ -0,0 +1,21 @@
1
+ from .estimators import (
2
+ center,
3
+ spread,
4
+ rel_spread,
5
+ shift,
6
+ ratio,
7
+ avg_spread,
8
+ disparity
9
+ )
10
+
11
+ __all__ = [
12
+ 'center',
13
+ 'spread',
14
+ 'rel_spread',
15
+ 'shift',
16
+ 'ratio',
17
+ 'avg_spread',
18
+ 'disparity'
19
+ ]
20
+
21
+ __version__ = '3.1.6'
@@ -0,0 +1,71 @@
1
+ from typing import Sequence, Union
2
+ import numpy as np
3
+ from numpy.typing import NDArray
4
+ from .fast_center import _fast_center
5
+ from .fast_spread import _fast_spread
6
+
7
+
8
+ def center(x: Union[Sequence[float], NDArray]) -> float:
9
+ x = np.asarray(x)
10
+ n = len(x)
11
+ if n == 0:
12
+ raise ValueError("Input array cannot be empty")
13
+ # Use fast O(n log n) algorithm
14
+ return _fast_center(x.tolist())
15
+
16
+
17
+ def spread(x: Union[Sequence[float], NDArray]) -> float:
18
+ x = np.asarray(x)
19
+ n = len(x)
20
+ if n == 0:
21
+ raise ValueError("Input array cannot be empty")
22
+ if n == 1:
23
+ return 0.0
24
+ # Use fast O(n log n) algorithm
25
+ return _fast_spread(x.tolist())
26
+
27
+
28
+ def rel_spread(x: Union[Sequence[float], NDArray]) -> float:
29
+ center_val = center(x)
30
+ if center_val == 0:
31
+ raise ValueError("RelSpread is undefined when Center equals zero")
32
+ return spread(x) / abs(center_val)
33
+
34
+
35
+ def shift(x: Union[Sequence[float], NDArray], y: Union[Sequence[float], NDArray]) -> float:
36
+ x = np.asarray(x)
37
+ y = np.asarray(y)
38
+ if len(x) == 0 or len(y) == 0:
39
+ raise ValueError("Input arrays cannot be empty")
40
+ pairwise_shifts = np.subtract.outer(x, y)
41
+ return float(np.median(pairwise_shifts))
42
+
43
+
44
+ def ratio(x: Union[Sequence[float], NDArray], y: Union[Sequence[float], NDArray]) -> float:
45
+ x = np.asarray(x)
46
+ y = np.asarray(y)
47
+ if len(x) == 0 or len(y) == 0:
48
+ raise ValueError("Input arrays cannot be empty")
49
+ if np.any(y <= 0):
50
+ raise ValueError("All values in y must be strictly positive")
51
+ pairwise_ratios = np.divide.outer(x, y)
52
+ return float(np.median(pairwise_ratios))
53
+
54
+
55
+ def avg_spread(x: Union[Sequence[float], NDArray], y: Union[Sequence[float], NDArray]) -> float:
56
+ x = np.asarray(x)
57
+ y = np.asarray(y)
58
+ n = len(x)
59
+ m = len(y)
60
+ if n == 0 or m == 0:
61
+ raise ValueError("Input arrays cannot be empty")
62
+ spread_x = spread(x)
63
+ spread_y = spread(y)
64
+ return (n * spread_x + m * spread_y) / (n + m)
65
+
66
+
67
+ def disparity(x: Union[Sequence[float], NDArray], y: Union[Sequence[float], NDArray]) -> float:
68
+ avg_spread_val = avg_spread(x, y)
69
+ if avg_spread_val == 0:
70
+ return float('inf')
71
+ return shift(x, y) / avg_spread_val
@@ -0,0 +1,215 @@
1
+ """Fast O(n log n) implementation of the Center (Hodges-Lehmann) estimator.
2
+
3
+ Based on Monahan's Algorithm 616 (1984).
4
+ """
5
+
6
+ from typing import List
7
+ import random
8
+ import numpy as np
9
+
10
+ # Try to import the C implementation, fall back to pure Python if unavailable
11
+ try:
12
+ from . import _fast_center_c
13
+ _HAS_C_EXTENSION = True
14
+ except ImportError:
15
+ _HAS_C_EXTENSION = False
16
+
17
+
18
+ def _fast_center_python(values: List[float]) -> float:
19
+ """
20
+ Pure Python implementation of fast center estimator.
21
+
22
+ Compute the median of all pairwise averages (xi + xj)/2 efficiently.
23
+
24
+ Time complexity: O(n log n) expected
25
+ Space complexity: O(n)
26
+
27
+ Args:
28
+ values: A list of numeric values
29
+
30
+ Returns:
31
+ The center estimate (Hodges-Lehmann estimator)
32
+ """
33
+ n = len(values)
34
+ if n == 0:
35
+ raise ValueError("Input array cannot be empty")
36
+ if n == 1:
37
+ return values[0]
38
+ if n == 2:
39
+ return (values[0] + values[1]) / 2
40
+
41
+ # Sort the values
42
+ sorted_values = sorted(values)
43
+
44
+ # Calculate target median rank(s) among all pairwise sums
45
+ total_pairs = n * (n + 1) // 2
46
+ median_rank_low = (total_pairs + 1) // 2 # 1-based rank
47
+ median_rank_high = (total_pairs + 2) // 2
48
+
49
+ # Initialize search bounds for each row (1-based indexing)
50
+ left_bounds = [i + 1 for i in range(n)] # Row i pairs with columns [i+1..n]
51
+ right_bounds = [n for i in range(n)]
52
+
53
+ # Start with a good pivot: sum of middle elements
54
+ pivot = sorted_values[(n - 1) // 2] + sorted_values[n // 2]
55
+ active_set_size = total_pairs
56
+ previous_count = 0
57
+
58
+ while True:
59
+ # === PARTITION STEP ===
60
+ # Count pairwise sums less than current pivot
61
+ count_below_pivot = 0
62
+ current_column = n
63
+ partition_counts = []
64
+
65
+ for row in range(1, n + 1): # 1-based
66
+ # Move left from current column until we find sums < pivot
67
+ while current_column >= row and sorted_values[row - 1] + sorted_values[current_column - 1] >= pivot:
68
+ current_column -= 1
69
+
70
+ # Count elements in this row that are < pivot
71
+ elements_below = max(0, current_column - row + 1)
72
+ partition_counts.append(elements_below)
73
+ count_below_pivot += elements_below
74
+
75
+ # === CONVERGENCE CHECK ===
76
+ if count_below_pivot == previous_count:
77
+ # No progress - use midrange strategy
78
+ min_active_sum = float('inf')
79
+ max_active_sum = float('-inf')
80
+
81
+ for i in range(n):
82
+ if left_bounds[i] > right_bounds[i]:
83
+ continue
84
+
85
+ row_value = sorted_values[i]
86
+ smallest_in_row = sorted_values[left_bounds[i] - 1] + row_value
87
+ largest_in_row = sorted_values[right_bounds[i] - 1] + row_value
88
+
89
+ min_active_sum = min(min_active_sum, smallest_in_row)
90
+ max_active_sum = max(max_active_sum, largest_in_row)
91
+
92
+ pivot = (min_active_sum + max_active_sum) / 2
93
+ if pivot <= min_active_sum or pivot > max_active_sum:
94
+ pivot = max_active_sum
95
+
96
+ if min_active_sum == max_active_sum or active_set_size <= 2:
97
+ return pivot / 2
98
+
99
+ continue
100
+
101
+ # === TARGET CHECK ===
102
+ at_target_rank = (count_below_pivot == median_rank_low or
103
+ count_below_pivot == median_rank_high - 1)
104
+
105
+ if at_target_rank:
106
+ # Find boundary values
107
+ largest_below_pivot = float('-inf')
108
+ smallest_at_or_above_pivot = float('inf')
109
+
110
+ for i in range(n):
111
+ count_in_row = partition_counts[i]
112
+ row_value = sorted_values[i]
113
+ total_in_row = n - i
114
+
115
+ # Find largest sum in this row that's < pivot
116
+ if count_in_row > 0:
117
+ last_below_index = i + count_in_row
118
+ last_below_value = row_value + sorted_values[last_below_index - 1]
119
+ largest_below_pivot = max(largest_below_pivot, last_below_value)
120
+
121
+ # Find smallest sum in this row that's >= pivot
122
+ if count_in_row < total_in_row:
123
+ first_at_or_above_index = i + count_in_row + 1
124
+ first_at_or_above_value = row_value + sorted_values[first_at_or_above_index - 1]
125
+ smallest_at_or_above_pivot = min(smallest_at_or_above_pivot, first_at_or_above_value)
126
+
127
+ # Calculate final result
128
+ if median_rank_low < median_rank_high:
129
+ # Even total: average the two middle values
130
+ return (smallest_at_or_above_pivot + largest_below_pivot) / 4
131
+ else:
132
+ # Odd total: return the single middle value
133
+ need_largest = (count_below_pivot == median_rank_low)
134
+ return (largest_below_pivot if need_largest else smallest_at_or_above_pivot) / 2
135
+
136
+ # === UPDATE BOUNDS ===
137
+ if count_below_pivot < median_rank_low:
138
+ # Too few values below pivot - search higher
139
+ for i in range(n):
140
+ left_bounds[i] = i + partition_counts[i] + 1
141
+ else:
142
+ # Too many values below pivot - search lower
143
+ for i in range(n):
144
+ right_bounds[i] = i + partition_counts[i]
145
+
146
+ # === PREPARE NEXT ITERATION ===
147
+ previous_count = count_below_pivot
148
+
149
+ # Recalculate active set size
150
+ active_set_size = sum(max(0, right_bounds[i] - left_bounds[i] + 1) for i in range(n))
151
+
152
+ # Choose next pivot
153
+ if active_set_size > 2:
154
+ # Use randomized row median strategy
155
+ target_index = random.randint(0, active_set_size - 1)
156
+ cumulative_size = 0
157
+ selected_row = 0
158
+
159
+ for i in range(n):
160
+ row_size = max(0, right_bounds[i] - left_bounds[i] + 1)
161
+ if target_index < cumulative_size + row_size:
162
+ selected_row = i
163
+ break
164
+ cumulative_size += row_size
165
+
166
+ # Use median element of the selected row as pivot
167
+ median_column_in_row = (left_bounds[selected_row] + right_bounds[selected_row]) // 2
168
+ pivot = sorted_values[selected_row] + sorted_values[median_column_in_row - 1]
169
+ else:
170
+ # Few elements remain - use midrange strategy
171
+ min_remaining_sum = float('inf')
172
+ max_remaining_sum = float('-inf')
173
+
174
+ for i in range(n):
175
+ if left_bounds[i] > right_bounds[i]:
176
+ continue
177
+
178
+ row_value = sorted_values[i]
179
+ min_in_row = sorted_values[left_bounds[i] - 1] + row_value
180
+ max_in_row = sorted_values[right_bounds[i] - 1] + row_value
181
+
182
+ min_remaining_sum = min(min_remaining_sum, min_in_row)
183
+ max_remaining_sum = max(max_remaining_sum, max_in_row)
184
+
185
+ pivot = (min_remaining_sum + max_remaining_sum) / 2
186
+ if pivot <= min_remaining_sum or pivot > max_remaining_sum:
187
+ pivot = max_remaining_sum
188
+
189
+ if min_remaining_sum == max_remaining_sum:
190
+ return pivot / 2
191
+
192
+
193
+ def _fast_center(values: List[float]) -> float:
194
+ """
195
+ Compute the median of all pairwise averages (xi + xj)/2 efficiently.
196
+
197
+ Internal implementation - not part of public API.
198
+ Uses C implementation if available, falls back to pure Python.
199
+
200
+ Time complexity: O(n log n) expected
201
+ Space complexity: O(n)
202
+
203
+ Args:
204
+ values: A list of numeric values
205
+
206
+ Returns:
207
+ The center estimate (Hodges-Lehmann estimator)
208
+ """
209
+ if _HAS_C_EXTENSION:
210
+ # Convert to numpy array and use C implementation
211
+ arr = np.asarray(values, dtype=np.float64)
212
+ return _fast_center_c.fast_center_c(arr)
213
+ else:
214
+ # Fall back to pure Python implementation
215
+ return _fast_center_python(values)