benchmark-reliability 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {benchmark_reliability-0.1.2/src/benchmark_reliability.egg-info → benchmark_reliability-0.1.3}/PKG-INFO +20 -20
  2. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/README.md +19 -19
  3. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/pyproject.toml +1 -1
  4. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/setup.py +1 -1
  5. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3/src/benchmark_reliability.egg-info}/PKG-INFO +20 -20
  6. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/LICENSE +0 -0
  7. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/setup.cfg +0 -0
  8. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/SOURCES.txt +0 -0
  9. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/dependency_links.txt +0 -0
  10. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/requires.txt +0 -0
  11. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/benchmark_reliability.egg-info/top_level.txt +0 -0
  12. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/__init__.py +0 -0
  13. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/analyzer.py +0 -0
  14. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/__init__.py +0 -0
  15. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/baseline_gap.py +0 -0
  16. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/instability.py +0 -0
  17. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/metadata.py +0 -0
  18. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/metrics/null_test.py +0 -0
  19. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/__init__.py +0 -0
  20. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/classifier.py +0 -0
  21. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/embedding.py +0 -0
  22. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/phase/visualization.py +0 -0
  23. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/report/__init__.py +0 -0
  24. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/report/json_export.py +0 -0
  25. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/src/brf/report/latex_export.py +0 -0
  26. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/tests/test_analyzer.py +0 -0
  27. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/tests/test_metrics.py +0 -0
  28. {benchmark_reliability-0.1.2 → benchmark_reliability-0.1.3}/tests/test_phase.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: benchmark-reliability
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing for predictive benchmarks
5
5
  Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
6
  License: MIT
@@ -38,7 +38,7 @@ from brf.phase import plot_phase_diagram
38
38
  from brf.report import export_json
39
39
 
40
40
  analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
41
- print(analyzer.brf_vector) # (B, I, N, M) (S, E) class
41
+ print(analyzer.brf_vector) # (B, I, N, M) -> (S, E) -> class
42
42
 
43
43
  # Visualization
44
44
  plot_phase_diagram(
@@ -55,20 +55,20 @@ export_json(analyzer.brf_vector, "results.json")
55
55
 
56
56
  ```
57
57
  brf/
58
- ├── __init__.py
59
- ├── analyzer.py BRFAnalyzer main class
60
- ├── metrics/
61
- ├── baseline_gap.py B
62
- ├── instability.py I
63
- ├── null_test.py N (permutation test)
64
- └── metadata.py M
65
- ├── phase/
66
- ├── embedding.py S = N - I, E = B + M
67
- ├── classifier.py Reliable / Fragile / Void
68
- └── visualization.py phase diagram, clustering plot
69
- ├── report/
70
- ├── json_export.py
71
- └── latex_export.py
58
+ |-- __init__.py
59
+ |-- analyzer.py <- BRFAnalyzer main class
60
+ |-- metrics/
61
+ | |-- baseline_gap.py <- B
62
+ | |-- instability.py <- I
63
+ | |-- null_test.py <- N (permutation test)
64
+ | |-- metadata.py <- M
65
+ |-- phase/
66
+ | |-- embedding.py <- S = N - I, E = B + M
67
+ | |-- classifier.py <- Reliable / Fragile / Void
68
+ | |-- visualization.py <- phase diagram, clustering plot
69
+ |-- report/
70
+ | |-- json_export.py
71
+ | |-- latex_export.py
72
72
  ```
73
73
 
74
74
  ## Steps
@@ -86,9 +86,9 @@ brf/
86
86
 
87
87
  ### Phase 3: Documentation + distribution (1-2 weeks)
88
88
  - [x] Write README with quick-start tutorial and API docs
89
- - [ ] Publish to TestPyPI PyPI
89
+ - [ ] Publish to TestPyPI -> PyPI
90
90
  - [ ] Set up ReadTheDocs for auto-generated documentation
91
- - [ ] Add GitHub Actions CI (test on Python 3.93.12)
91
+ - [ ] Add GitHub Actions CI (test on Python 3.9-3.12)
92
92
 
93
93
  ### Phase 4: HuggingFace Hub integration (optional, 1 week)
94
94
  - [ ] Add HF dataset loading wrapper
@@ -104,7 +104,7 @@ brf/
104
104
  ## Relationship to Sister Repos
105
105
 
106
106
  - `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
107
- - `LLMScoringAudit/`: first applied use case (MM-TBA × multiple LLMs)
107
+ - `LLMScoringAudit/`: first applied use case (MM-TBA x multiple LLMs)
108
108
  - `BenchmarkPhase/`: large-scale application (30 datasets -> BRF leaderboard)
109
109
  - `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
110
110
 
@@ -115,7 +115,7 @@ brf/
115
115
 
116
116
  ## Timeline
117
117
 
118
- - Phase 12: 3 weeks
118
+ - Phase 1-2: 3 weeks
119
119
  - Phase 3: 2 weeks
120
120
  - Phase 4: optional
121
121
  - JOSS submission: after Phase 3
@@ -14,7 +14,7 @@ from brf.phase import plot_phase_diagram
14
14
  from brf.report import export_json
15
15
 
16
16
  analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
17
- print(analyzer.brf_vector) # (B, I, N, M) (S, E) class
17
+ print(analyzer.brf_vector) # (B, I, N, M) -> (S, E) -> class
18
18
 
19
19
  # Visualization
20
20
  plot_phase_diagram(
@@ -31,20 +31,20 @@ export_json(analyzer.brf_vector, "results.json")
31
31
 
32
32
  ```
33
33
  brf/
34
- ├── __init__.py
35
- ├── analyzer.py BRFAnalyzer main class
36
- ├── metrics/
37
- ├── baseline_gap.py B
38
- ├── instability.py I
39
- ├── null_test.py N (permutation test)
40
- └── metadata.py M
41
- ├── phase/
42
- ├── embedding.py S = N - I, E = B + M
43
- ├── classifier.py Reliable / Fragile / Void
44
- └── visualization.py phase diagram, clustering plot
45
- ├── report/
46
- ├── json_export.py
47
- └── latex_export.py
34
+ |-- __init__.py
35
+ |-- analyzer.py <- BRFAnalyzer main class
36
+ |-- metrics/
37
+ | |-- baseline_gap.py <- B
38
+ | |-- instability.py <- I
39
+ | |-- null_test.py <- N (permutation test)
40
+ | |-- metadata.py <- M
41
+ |-- phase/
42
+ | |-- embedding.py <- S = N - I, E = B + M
43
+ | |-- classifier.py <- Reliable / Fragile / Void
44
+ | |-- visualization.py <- phase diagram, clustering plot
45
+ |-- report/
46
+ | |-- json_export.py
47
+ | |-- latex_export.py
48
48
  ```
49
49
 
50
50
  ## Steps
@@ -62,9 +62,9 @@ brf/
62
62
 
63
63
  ### Phase 3: Documentation + distribution (1-2 weeks)
64
64
  - [x] Write README with quick-start tutorial and API docs
65
- - [ ] Publish to TestPyPI PyPI
65
+ - [ ] Publish to TestPyPI -> PyPI
66
66
  - [ ] Set up ReadTheDocs for auto-generated documentation
67
- - [ ] Add GitHub Actions CI (test on Python 3.93.12)
67
+ - [ ] Add GitHub Actions CI (test on Python 3.9-3.12)
68
68
 
69
69
  ### Phase 4: HuggingFace Hub integration (optional, 1 week)
70
70
  - [ ] Add HF dataset loading wrapper
@@ -80,7 +80,7 @@ brf/
80
80
  ## Relationship to Sister Repos
81
81
 
82
82
  - `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
83
- - `LLMScoringAudit/`: first applied use case (MM-TBA × multiple LLMs)
83
+ - `LLMScoringAudit/`: first applied use case (MM-TBA x multiple LLMs)
84
84
  - `BenchmarkPhase/`: large-scale application (30 datasets -> BRF leaderboard)
85
85
  - `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
86
86
 
@@ -91,7 +91,7 @@ brf/
91
91
 
92
92
  ## Timeline
93
93
 
94
- - Phase 12: 3 weeks
94
+ - Phase 1-2: 3 weeks
95
95
  - Phase 3: 2 weeks
96
96
  - Phase 4: optional
97
97
  - JOSS submission: after Phase 3
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "benchmark-reliability"
7
- version = "0.1.2"
7
+ version = "0.1.3"
8
8
  description = "Benchmark Reliability Framework (BRF) - dataset-level reliability auditing for predictive benchmarks"
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="benchmark-reliability",
5
- version="0.1.2",
5
+ version="0.1.3",
6
6
  packages=find_packages(where="src"),
7
7
  package_dir={"": "src"},
8
8
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: benchmark-reliability
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Benchmark Reliability Framework (BRF) - dataset-level reliability auditing for predictive benchmarks
5
5
  Author-email: zhanglizhuo <zhanglizhuo@gmail.com>
6
6
  License: MIT
@@ -38,7 +38,7 @@ from brf.phase import plot_phase_diagram
38
38
  from brf.report import export_json
39
39
 
40
40
  analyzer = BRFAnalyzer(n_splits=30, n_permutations=200).fit(X, y, groups=groups)
41
- print(analyzer.brf_vector) # (B, I, N, M) (S, E) class
41
+ print(analyzer.brf_vector) # (B, I, N, M) -> (S, E) -> class
42
42
 
43
43
  # Visualization
44
44
  plot_phase_diagram(
@@ -55,20 +55,20 @@ export_json(analyzer.brf_vector, "results.json")
55
55
 
56
56
  ```
57
57
  brf/
58
- ├── __init__.py
59
- ├── analyzer.py BRFAnalyzer main class
60
- ├── metrics/
61
- ├── baseline_gap.py B
62
- ├── instability.py I
63
- ├── null_test.py N (permutation test)
64
- └── metadata.py M
65
- ├── phase/
66
- ├── embedding.py S = N - I, E = B + M
67
- ├── classifier.py Reliable / Fragile / Void
68
- └── visualization.py phase diagram, clustering plot
69
- ├── report/
70
- ├── json_export.py
71
- └── latex_export.py
58
+ |-- __init__.py
59
+ |-- analyzer.py <- BRFAnalyzer main class
60
+ |-- metrics/
61
+ | |-- baseline_gap.py <- B
62
+ | |-- instability.py <- I
63
+ | |-- null_test.py <- N (permutation test)
64
+ | |-- metadata.py <- M
65
+ |-- phase/
66
+ | |-- embedding.py <- S = N - I, E = B + M
67
+ | |-- classifier.py <- Reliable / Fragile / Void
68
+ | |-- visualization.py <- phase diagram, clustering plot
69
+ |-- report/
70
+ | |-- json_export.py
71
+ | |-- latex_export.py
72
72
  ```
73
73
 
74
74
  ## Steps
@@ -86,9 +86,9 @@ brf/
86
86
 
87
87
  ### Phase 3: Documentation + distribution (1-2 weeks)
88
88
  - [x] Write README with quick-start tutorial and API docs
89
- - [ ] Publish to TestPyPI PyPI
89
+ - [ ] Publish to TestPyPI -> PyPI
90
90
  - [ ] Set up ReadTheDocs for auto-generated documentation
91
- - [ ] Add GitHub Actions CI (test on Python 3.93.12)
91
+ - [ ] Add GitHub Actions CI (test on Python 3.9-3.12)
92
92
 
93
93
  ### Phase 4: HuggingFace Hub integration (optional, 1 week)
94
94
  - [ ] Add HF dataset loading wrapper
@@ -104,7 +104,7 @@ brf/
104
104
  ## Relationship to Sister Repos
105
105
 
106
106
  - `BehaviorAudit/`: source of the audit logic; this package refactors and generalizes it
107
- - `LLMScoringAudit/`: first applied use case (MM-TBA × multiple LLMs)
107
+ - `LLMScoringAudit/`: first applied use case (MM-TBA x multiple LLMs)
108
108
  - `BenchmarkPhase/`: large-scale application (30 datasets -> BRF leaderboard)
109
109
  - `llm-annotation/`: cited for complementary MLLM pseudo-label reliability findings
110
110
 
@@ -115,7 +115,7 @@ brf/
115
115
 
116
116
  ## Timeline
117
117
 
118
- - Phase 12: 3 weeks
118
+ - Phase 1-2: 3 weeks
119
119
  - Phase 3: 2 weeks
120
120
  - Phase 4: optional
121
121
  - JOSS submission: after Phase 3