benchmark-reliability 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmark_reliability-0.1.2/src/benchmark_reliability.egg-info → benchmark_reliability-0.1.3}/PKG-INFO +20 -20
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/README.md +19 -19
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/pyproject.toml +1 -1
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/setup.py +1 -1
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3/src/benchmark_reliability.egg-info}/PKG-INFO +20 -20
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/LICENSE +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/setup.cfg +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/SOURCES.txt +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/dependency_links.txt +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/requires.txt +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/top_level.txt +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/__init__.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/analyzer.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/__init__.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/baseline_gap.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/instability.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/metadata.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/null_test.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/__init__.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/classifier.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/embedding.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/visualization.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/report/__init__.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/report/json_export.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/report/latex_export.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/tests/test_analyzer.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/tests/test_metrics.py +0 -0
- {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/tests/test_phase.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: benchmark-reliability
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing for predictive benchmarks
|
|
5
5
|
Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -38,7 +38,7 @@ from brf.phase import plot_phase_diagram
|
|
|
38
38
|
from brf.report import export_json
|
|
39
39
|
|
|
40
40
|
analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
|
|
41
|
-
print(analyzer.brf_vector) # (B, I, N, M)
|
|
41
|
+
print(analyzer.brf_vector) # (B, I, N, M) -> (S, E) -> class
|
|
42
42
|
|
|
43
43
|
# Visualization
|
|
44
44
|
plot_phase_diagram(
|
|
@@ -55,20 +55,20 @@ export_json(analyzer.brf_vector, "results.json")
|
|
|
55
55
|
|
|
56
56
|
```
|
|
57
57
|
brf/
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
58
|
+
|-- __init__.py
|
|
59
|
+
|-- analyzer.py <- BRFAnalyzer main class
|
|
60
|
+
|-- metrics/
|
|
61
|
+
| |-- baseline_gap.py <- B
|
|
62
|
+
| |-- instability.py <- I
|
|
63
|
+
| |-- null_test.py <- N (permutation test)
|
|
64
|
+
| |-- metadata.py <- M
|
|
65
|
+
|-- phase/
|
|
66
|
+
| |-- embedding.py <- S = N - I, E = B + M
|
|
67
|
+
| |-- classifier.py <- Reliable / Fragile / Void
|
|
68
|
+
| |-- visualization.py <- phase diagram, clustering plot
|
|
69
|
+
|-- report/
|
|
70
|
+
| |-- json_export.py
|
|
71
|
+
| |-- latex_export.py
|
|
72
72
|
```
|
|
73
73
|
|
|
74
74
|
## Steps
|
|
@@ -86,9 +86,9 @@ brf/
|
|
|
86
86
|
|
|
87
87
|
### Phase 3: Documentation + distribution (1-2 weeks)
|
|
88
88
|
- [x] Write README with quick-start tutorial and API docs
|
|
89
|
-
- [ ] Publish to TestPyPI
|
|
89
|
+
- [ ] Publish to TestPyPI -> PyPI
|
|
90
90
|
- [ ] Set up ReadTheDocs for auto-generated documentation
|
|
91
|
-
- [ ] Add GitHub Actions CI (test on Python 3.9
|
|
91
|
+
- [ ] Add GitHub Actions CI (test on Python 3.9-3.12)
|
|
92
92
|
|
|
93
93
|
### Phase 4: HuggingFace Hub integration (optional, 1 week)
|
|
94
94
|
- [ ] Add HF dataset loading wrapper
|
|
@@ -104,7 +104,7 @@ brf/
|
|
|
104
104
|
## Relationship to Sister Repos
|
|
105
105
|
|
|
106
106
|
- `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
|
|
107
|
-
- `LLMScoringAudit/`: first applied use case (MM-TBA
|
|
107
|
+
- `LLMScoringAudit/`: first applied use case (MM-TBA x multiple LLMs)
|
|
108
108
|
- `BenchmarkPhase/`: large-scale application (30 datasets BRF leaderboard)
|
|
109
109
|
- `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
|
|
110
110
|
|
|
@@ -115,7 +115,7 @@ brf/
|
|
|
115
115
|
|
|
116
116
|
## Timeline
|
|
117
117
|
|
|
118
|
-
- Phase 1
|
|
118
|
+
- Phase 1-2: 3 weeks
|
|
119
119
|
- Phase 3: 2 weeks
|
|
120
120
|
- Phase 4: optional
|
|
121
121
|
- JOSS submission: after Phase 3
|
|
@@ -14,7 +14,7 @@ from brf.phase import plot_phase_diagram
|
|
|
14
14
|
from brf.report import export_json
|
|
15
15
|
|
|
16
16
|
analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
|
|
17
|
-
print(analyzer.brf_vector) # (B, I, N, M)
|
|
17
|
+
print(analyzer.brf_vector) # (B, I, N, M) -> (S, E) -> class
|
|
18
18
|
|
|
19
19
|
# Visualization
|
|
20
20
|
plot_phase_diagram(
|
|
@@ -31,20 +31,20 @@ export_json(analyzer.brf_vector, "results.json")
|
|
|
31
31
|
|
|
32
32
|
```
|
|
33
33
|
brf/
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
34
|
+
|-- __init__.py
|
|
35
|
+
|-- analyzer.py <- BRFAnalyzer main class
|
|
36
|
+
|-- metrics/
|
|
37
|
+
| |-- baseline_gap.py <- B
|
|
38
|
+
| |-- instability.py <- I
|
|
39
|
+
| |-- null_test.py <- N (permutation test)
|
|
40
|
+
| |-- metadata.py <- M
|
|
41
|
+
|-- phase/
|
|
42
|
+
| |-- embedding.py <- S = N - I, E = B + M
|
|
43
|
+
| |-- classifier.py <- Reliable / Fragile / Void
|
|
44
|
+
| |-- visualization.py <- phase diagram, clustering plot
|
|
45
|
+
|-- report/
|
|
46
|
+
| |-- json_export.py
|
|
47
|
+
| |-- latex_export.py
|
|
48
48
|
```
|
|
49
49
|
|
|
50
50
|
## Steps
|
|
@@ -62,9 +62,9 @@ brf/
|
|
|
62
62
|
|
|
63
63
|
### Phase 3: Documentation + distribution (1-2 weeks)
|
|
64
64
|
- [x] Write README with quick-start tutorial and API docs
|
|
65
|
-
- [ ] Publish to TestPyPI
|
|
65
|
+
- [ ] Publish to TestPyPI -> PyPI
|
|
66
66
|
- [ ] Set up ReadTheDocs for auto-generated documentation
|
|
67
|
-
- [ ] Add GitHub Actions CI (test on Python 3.9
|
|
67
|
+
- [ ] Add GitHub Actions CI (test on Python 3.9-3.12)
|
|
68
68
|
|
|
69
69
|
### Phase 4: HuggingFace Hub integration (optional, 1 week)
|
|
70
70
|
- [ ] Add HF dataset loading wrapper
|
|
@@ -80,7 +80,7 @@ brf/
|
|
|
80
80
|
## Relationship to Sister Repos
|
|
81
81
|
|
|
82
82
|
- `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
|
|
83
|
-
- `LLMScoringAudit/`: first applied use case (MM-TBA
|
|
83
|
+
- `LLMScoringAudit/`: first applied use case (MM-TBA x multiple LLMs)
|
|
84
84
|
- `BenchmarkPhase/`: large-scale application (30 datasets BRF leaderboard)
|
|
85
85
|
- `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
|
|
86
86
|
|
|
@@ -91,7 +91,7 @@ brf/
|
|
|
91
91
|
|
|
92
92
|
## Timeline
|
|
93
93
|
|
|
94
|
-
- Phase 1
|
|
94
|
+
- Phase 1-2: 3 weeks
|
|
95
95
|
- Phase 3: 2 weeks
|
|
96
96
|
- Phase 4: optional
|
|
97
97
|
- JOSS submission: after Phase 3
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "benchmark-reliability"
|
|
7
|
-
version = "0.1.2"
|
|
7
|
+
version = "0.1.3"
|
|
8
8
|
description = "Benchmark Reliability Framework (BRF) - dataset-level reliability auditing for predictive benchmarks"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: benchmark-reliability
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing for predictive benchmarks
|
|
5
5
|
Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -38,7 +38,7 @@ from brf.phase import plot_phase_diagram
|
|
|
38
38
|
from brf.report import export_json
|
|
39
39
|
|
|
40
40
|
analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
|
|
41
|
-
print(analyzer.brf_vector) # (B, I, N, M)
|
|
41
|
+
print(analyzer.brf_vector) # (B, I, N, M) -> (S, E) -> class
|
|
42
42
|
|
|
43
43
|
# Visualization
|
|
44
44
|
plot_phase_diagram(
|
|
@@ -55,20 +55,20 @@ export_json(analyzer.brf_vector, "results.json")
|
|
|
55
55
|
|
|
56
56
|
```
|
|
57
57
|
brf/
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
58
|
+
|-- __init__.py
|
|
59
|
+
|-- analyzer.py <- BRFAnalyzer main class
|
|
60
|
+
|-- metrics/
|
|
61
|
+
| |-- baseline_gap.py <- B
|
|
62
|
+
| |-- instability.py <- I
|
|
63
|
+
| |-- null_test.py <- N (permutation test)
|
|
64
|
+
| |-- metadata.py <- M
|
|
65
|
+
|-- phase/
|
|
66
|
+
| |-- embedding.py <- S = N - I, E = B + M
|
|
67
|
+
| |-- classifier.py <- Reliable / Fragile / Void
|
|
68
|
+
| |-- visualization.py <- phase diagram, clustering plot
|
|
69
|
+
|-- report/
|
|
70
|
+
| |-- json_export.py
|
|
71
|
+
| |-- latex_export.py
|
|
72
72
|
```
|
|
73
73
|
|
|
74
74
|
## Steps
|
|
@@ -86,9 +86,9 @@ brf/
|
|
|
86
86
|
|
|
87
87
|
### Phase 3: Documentation + distribution (1-2 weeks)
|
|
88
88
|
- [x] Write README with quick-start tutorial and API docs
|
|
89
|
-
- [ ] Publish to TestPyPI
|
|
89
|
+
- [ ] Publish to TestPyPI -> PyPI
|
|
90
90
|
- [ ] Set up ReadTheDocs for auto-generated documentation
|
|
91
|
-
- [ ] Add GitHub Actions CI (test on Python 3.9
|
|
91
|
+
- [ ] Add GitHub Actions CI (test on Python 3.9-3.12)
|
|
92
92
|
|
|
93
93
|
### Phase 4: HuggingFace Hub integration (optional, 1 week)
|
|
94
94
|
- [ ] Add HF dataset loading wrapper
|
|
@@ -104,7 +104,7 @@ brf/
|
|
|
104
104
|
## Relationship to Sister Repos
|
|
105
105
|
|
|
106
106
|
- `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
|
|
107
|
-
- `LLMScoringAudit/`: first applied use case (MM-TBA
|
|
107
|
+
- `LLMScoringAudit/`: first applied use case (MM-TBA x multiple LLMs)
|
|
108
108
|
- `BenchmarkPhase/`: large-scale application (30 datasets BRF leaderboard)
|
|
109
109
|
- `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
|
|
110
110
|
|
|
@@ -115,7 +115,7 @@ brf/
|
|
|
115
115
|
|
|
116
116
|
## Timeline
|
|
117
117
|
|
|
118
|
-
- Phase 1
|
|
118
|
+
- Phase 1-2: 3 weeks
|
|
119
119
|
- Phase 3: 2 weeks
|
|
120
120
|
- Phase 4: optional
|
|
121
121
|
- JOSS submission: after Phase 3
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|