sadbert 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sadbert-0.1.0/LICENSE +32 -0
- sadbert-0.1.0/MANIFEST.in +14 -0
- sadbert-0.1.0/PKG-INFO +270 -0
- sadbert-0.1.0/README.md +199 -0
- sadbert-0.1.0/pyproject.toml +94 -0
- sadbert-0.1.0/requirements.txt +9 -0
- sadbert-0.1.0/sadbert/__init__.py +40 -0
- sadbert-0.1.0/sadbert/core.py +543 -0
- sadbert-0.1.0/sadbert/data/ROC_dict.pkl +0 -0
- sadbert-0.1.0/sadbert/data/__init__.py +1 -0
- sadbert-0.1.0/sadbert/data/interpretation_dict.pkl +0 -0
- sadbert-0.1.0/sadbert/data/label_mappings.pkl +0 -0
- sadbert-0.1.0/sadbert.egg-info/PKG-INFO +270 -0
- sadbert-0.1.0/sadbert.egg-info/SOURCES.txt +18 -0
- sadbert-0.1.0/sadbert.egg-info/dependency_links.txt +1 -0
- sadbert-0.1.0/sadbert.egg-info/requires.txt +13 -0
- sadbert-0.1.0/sadbert.egg-info/top_level.txt +1 -0
- sadbert-0.1.0/setup.cfg +4 -0
- sadbert-0.1.0/setup.py +9 -0
- sadbert-0.1.0/tests/test_sadbert.py +160 -0
sadbert-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
SADBERT Software License
|
|
2
|
+
Copyright (c) 2026 Xander Deanhardt
|
|
3
|
+
|
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
5
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
6
|
+
in the Software without restriction, including without limitation the rights
|
|
7
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
9
|
+
furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
1. The above copyright notice and this permission notice shall be included in
|
|
12
|
+
all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
2. RESERVATION OF ACADEMIC PUBLICATION RIGHTS: Xander Deanhardt retains
|
|
15
|
+
exclusive rights to publish, author, and submit academic research papers,
|
|
16
|
+
journal articles, conference proceedings, theses, dissertations, and other
|
|
17
|
+
scholarly works that present the design, architecture, training methodology,
|
|
18
|
+
fine-tuning procedures, evaluation, or scientific contributions of this
|
|
19
|
+
Software or its underlying models as original research contributions.
|
|
20
|
+
Users of the Software may freely publish work in which this Software is
|
|
21
|
+
used as a tool or cited as a resource, but may not publish work that claims
|
|
22
|
+
the Software's methodology, training procedures, or model architecture as
|
|
23
|
+
the user's own original contributions without prior written permission from
|
|
24
|
+
Xander Deanhardt.
|
|
25
|
+
|
|
26
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
27
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
28
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
29
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
30
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
31
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
32
|
+
SOFTWARE.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Include the bundled data files in source distributions (sdist)
|
|
2
|
+
recursive-include sadbert/data *.pkl
|
|
3
|
+
|
|
4
|
+
# Include documentation and metadata
|
|
5
|
+
include README.md
|
|
6
|
+
include LICENSE
|
|
7
|
+
include pyproject.toml
|
|
8
|
+
include requirements.txt
|
|
9
|
+
|
|
10
|
+
# Exclude build artifacts and caches
|
|
11
|
+
exclude .gitignore
|
|
12
|
+
recursive-exclude * __pycache__
|
|
13
|
+
recursive-exclude * *.py[co]
|
|
14
|
+
recursive-exclude * .DS_Store
|
sadbert-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sadbert
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Stereotype-content analysis with fine-tuned DistilBERT models (SADCAT framework)
|
|
5
|
+
Author-email: Xander Deanhardt <xanderd24@uchicago.edu>
|
|
6
|
+
License: SADBERT Software License
|
|
7
|
+
Copyright (c) 2026 Xander Deanhardt
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
1. The above copyright notice and this permission notice shall be included in
|
|
17
|
+
all copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
2. RESERVATION OF ACADEMIC PUBLICATION RIGHTS: Xander Deanhardt retains
|
|
20
|
+
exclusive rights to publish, author, and submit academic research papers,
|
|
21
|
+
journal articles, conference proceedings, theses, dissertations, and other
|
|
22
|
+
scholarly works that present the design, architecture, training methodology,
|
|
23
|
+
fine-tuning procedures, evaluation, or scientific contributions of this
|
|
24
|
+
Software or its underlying models as original research contributions.
|
|
25
|
+
Users of the Software may freely publish work in which this Software is
|
|
26
|
+
used as a tool or cited as a resource, but may not publish work that claims
|
|
27
|
+
the Software's methodology, training procedures, or model architecture as
|
|
28
|
+
the user's own original contributions without prior written permission from
|
|
29
|
+
Xander Deanhardt.
|
|
30
|
+
|
|
31
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
32
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
33
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
34
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
35
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
36
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
37
|
+
SOFTWARE.
|
|
38
|
+
|
|
39
|
+
Project-URL: Homepage, https://github.com/XanderD24/sadbert
|
|
40
|
+
Project-URL: Repository, https://github.com/XanderD24/sadbert
|
|
41
|
+
Project-URL: Bug Tracker, https://github.com/XanderD24/sadbert/issues
|
|
42
|
+
Project-URL: HuggingFace, https://huggingface.co/XanderD24
|
|
43
|
+
Keywords: NLP,stereotype,social cognition,SADCAT,DistilBERT,transformers,sentiment analysis
|
|
44
|
+
Classifier: Development Status :: 3 - Alpha
|
|
45
|
+
Classifier: Intended Audience :: Science/Research
|
|
46
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
47
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
48
|
+
Classifier: License :: Other/Proprietary License
|
|
49
|
+
Classifier: Programming Language :: Python :: 3
|
|
50
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
51
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
52
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
53
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
54
|
+
Classifier: Operating System :: OS Independent
|
|
55
|
+
Requires-Python: >=3.9
|
|
56
|
+
Description-Content-Type: text/markdown
|
|
57
|
+
License-File: LICENSE
|
|
58
|
+
Requires-Dist: torch>=2.0.0
|
|
59
|
+
Requires-Dist: transformers>=4.35.0
|
|
60
|
+
Requires-Dist: pandas>=2.0.0
|
|
61
|
+
Requires-Dist: numpy>=1.24.0
|
|
62
|
+
Requires-Dist: huggingface_hub>=0.19.0
|
|
63
|
+
Provides-Extra: dev
|
|
64
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
65
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
66
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
67
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
68
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
69
|
+
Requires-Dist: twine>=4.0; extra == "dev"
|
|
70
|
+
Dynamic: license-file
|
|
71
|
+
|
|
72
|
+
# SADBERT
|
|
73
|
+
|
|
74
|
+
**Stereotype-content Analysis with DistilBERT** — a Python package for identifying and characterising stereotype-relevant dimensions in natural language text, based on the SADCAT (Semi-Automated Dictionary Creation for Analyzing Text) stereotype content dictionaries.
|
|
75
|
+
|
|
76
|
+
## What it does
|
|
77
|
+
|
|
78
|
+
Given a word or phrase, SADBERT predicts:
|
|
79
|
+
|
|
80
|
+
| Output column | Description |
|
|
81
|
+
|---|---|
|
|
82
|
+
| `category` | Stereotype-content dimension (e.g. *Warmth*, *Competence*) |
|
|
83
|
+
| `probability` | Confidence of the category classifier |
|
|
84
|
+
| `valence` | Direction within the category: `1` = positive, `0` = neutral, `−1` = negative |
|
|
85
|
+
| `valence probability` | Confidence of the valence prediction |
|
|
86
|
+
| `interpretation` | Human-readable label (e.g. *"Warm"*, *"Incompetent"*, *"Moral"*) |
|
|
87
|
+
|
|
88
|
+
### Categories detected
|
|
89
|
+
|
|
90
|
+
**Major** (with valence): Warmth · Competence · Sociability · Morality · Ability · Assertiveness · Status · Beliefs · health · deviance · beauty · Politics · Religion
|
|
91
|
+
|
|
92
|
+
**Minor** (category only, no valence): emotions · Geography · Appearance · occupation · socialgroups · inhabitant · country · relative · insults · stem · humanities · art · Lacksknowledge · fortune · clothing · bodpart · bodprop · skin · bodcov · beliefsother · Other\_large · Other
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Installation
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install sadbert
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
> **Note:** On first use, SADBERT automatically downloads ~2 GB of model weights from the HuggingFace Hub. These are cached locally in `~/.cache/huggingface/` and do not need to be re-downloaded on subsequent runs.
|
|
103
|
+
|
|
104
|
+
### GPU / Apple Silicon
|
|
105
|
+
|
|
106
|
+
SADBERT auto-detects CUDA and Apple MPS. To use a specific device, instantiate `SADBERT` directly:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from sadbert import SADBERT
|
|
110
|
+
model = SADBERT(device="cuda") # or "mps", "cpu"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Quick Start
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
import sadbert
|
|
119
|
+
|
|
120
|
+
# Single word — returns a DataFrame
|
|
121
|
+
df = sadbert.get_stereotype_content("honest")
|
|
122
|
+
print(df)
|
|
123
|
+
|
|
124
|
+
# Multiple words — stacked into one DataFrame (default)
|
|
125
|
+
df = sadbert.get_stereotype_content(["honest", "lazy", "senator"])
|
|
126
|
+
print(df)
|
|
127
|
+
|
|
128
|
+
# Multiple words — one DataFrame per word
|
|
129
|
+
results = sadbert.get_stereotype_content(["honest", "lazy"], stacked=False)
|
|
130
|
+
print(results["honest"])
|
|
131
|
+
print(results["lazy"])
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Example output
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
>>> sadbert.get_stereotype_content("honest")
|
|
138
|
+
|
|
139
|
+
category probability valence valence probability interpretation
|
|
140
|
+
0 Warmth 0.912 1.0 0.876 Warm
|
|
141
|
+
1 Morality 0.843 1.0 0.791 Moral
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## API reference
|
|
147
|
+
|
|
148
|
+
### `sadbert.get_stereotype_content(text, stacked=True)`
|
|
149
|
+
|
|
150
|
+
Module-level convenience function. Uses a shared, lazily-initialised `SADBERT` instance.
|
|
151
|
+
|
|
152
|
+
| Parameter | Type | Description |
|
|
153
|
+
|---|---|---|
|
|
154
|
+
| `text` | `str` or `list[str]` | Word(s) or phrase(s) to classify |
|
|
155
|
+
| `stacked` | `bool` | `True` (default): return one combined DataFrame with a `"text"` column. `False`: return a `dict[str, DataFrame]`. For single string input with `stacked=False`, returns the DataFrame directly. |
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
### `sadbert.SADBERT(device=None, batch_size=32, load_models=True)`
|
|
160
|
+
|
|
161
|
+
Instantiate your own SADBERT object for full control.
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from sadbert import SADBERT
|
|
165
|
+
|
|
166
|
+
model = SADBERT(
|
|
167
|
+
device="cuda", # "cuda" | "mps" | "cpu" | None (auto-detect)
|
|
168
|
+
batch_size=64, # increase for faster throughput on GPU
|
|
169
|
+
load_models=True, # set False to defer model loading to first call
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
results = model.get_stereotype_content(["nurse", "engineer", "senator"])
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Model architecture
|
|
178
|
+
|
|
179
|
+
SADBERT uses a three-stage ensemble:
|
|
180
|
+
|
|
181
|
+
```
|
|
182
|
+
Input text
|
|
183
|
+
│
|
|
184
|
+
▼
|
|
185
|
+
┌─────────────────────────────────────────────────────┐
|
|
186
|
+
│ Stage 1 · Master model (SADBERT_master_model) │
|
|
187
|
+
│ Multi-label DistilBERT, 35 output classes │
|
|
188
|
+
│ Softmax probabilities compared against per-class │
|
|
189
|
+
│ Youden-J thresholds → candidate categories │
|
|
190
|
+
└─────────────────────────────────────────────────────┘
|
|
191
|
+
│ candidate categories
|
|
192
|
+
▼
|
|
193
|
+
┌─────────────────────────────────────────────────────┐
|
|
194
|
+
│ Stage 2 · Classifier heads (SADBERT_{cat}_classifier) │
|
|
195
|
+
│ One binary DistilBERT per category │
|
|
196
|
+
│ Veto gate — keeps only categories confirmed by │
|
|
197
|
+
│ both master model and dedicated head │
|
|
198
|
+
└─────────────────────────────────────────────────────┘
|
|
199
|
+
│ confirmed categories + probabilities
|
|
200
|
+
▼
|
|
201
|
+
┌─────────────────────────────────────────────────────┐
|
|
202
|
+
│ Stage 3 · Sentiment models (SADBERT_{cat}_sentiment) │
|
|
203
|
+
│ One 3-class DistilBERT per major category │
|
|
204
|
+
│ Predicts negative / neutral / positive valence │
|
|
205
|
+
└─────────────────────────────────────────────────────┘
|
|
206
|
+
│
|
|
207
|
+
▼
|
|
208
|
+
Results DataFrame
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
All models are hosted on HuggingFace at [huggingface.co/XanderD24](https://huggingface.co/XanderD24).
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## Building from source
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
git clone https://github.com/XanderD24/sadbert.git
|
|
219
|
+
cd sadbert
|
|
220
|
+
|
|
221
|
+
# Install in editable mode with dev dependencies
|
|
222
|
+
pip install -e ".[dev]"
|
|
223
|
+
|
|
224
|
+
# Run tests
|
|
225
|
+
pytest
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Adding ROC_dict.pkl before publishing
|
|
229
|
+
|
|
230
|
+
`ROC_dict.pkl` (per-category probability thresholds) is **required** but not included in the repository for size reasons. Copy it into the data directory before building the wheel:
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
cp /path/to/ROC_dict.pkl sadbert/data/ROC_dict.pkl
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Then build:
|
|
237
|
+
|
|
238
|
+
```bash
|
|
239
|
+
python -m build # produces dist/sadbert-0.1.0.tar.gz and .whl
|
|
240
|
+
twine check dist/* # sanity-check before uploading
|
|
241
|
+
twine upload dist/* # publish to PyPI
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Citation
|
|
247
|
+
|
|
248
|
+
If you use SADBERT in research, please cite this package (and the original SADCAT work listed under "Citation of Original Work"):
|
|
249
|
+
|
|
250
|
+
```
|
|
251
|
+
@misc{sadbert2025,
|
|
252
|
+
author = {Deanhardt, Xander},
|
|
253
|
+
title = {{SADBERT}: Stereotype-content Analysis with {DistilBERT}},
|
|
254
|
+
year = {2025},
|
|
255
|
+
url = {https://github.com/XanderD24/sadbert},
|
|
256
|
+
}
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
## Citation of Original Work
|
|
261
|
+
All data used to fine-tune these models was taken from the SADCAT dictionary, published by Gandalf Nicolas, Xuecunzi Bai, and Susan T. Fiske, and hosted on GitHub by Gandalf Nicolas. It was first described in the following journal article:
|
|
262
|
+
|
|
263
|
+
Nicolas, Gandalf, et al. “Comprehensive Stereotype Content Dictionaries Using a Semi‐Automated Method.” European Journal of Social Psychology, vol. 51, no. 1, Feb. 2021, pp. 178–196, https://doi.org/10.1002/ejsp.2724.
|
|
264
|
+
|
|
265
|
+
Github Link = {https://github.com/gandalfnicolas/SADCAT/tree/master}
|
|
266
|
+
|
|
267
|
+
OSF Repository = {https://osf.io/yx45f/}
|
|
268
|
+
## License
|
|
269
|
+
|
|
270
|
+
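SADBERT Software License (a permissive MIT-style license with an additional reservation of academic publication rights) — see [LICENSE](LICENSE) for details.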
|
sadbert-0.1.0/README.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# SADBERT
|
|
2
|
+
|
|
3
|
+
**Stereotype-content Analysis with DistilBERT** — a Python package for identifying and characterising stereotype-relevant dimensions in natural language text, based on the SADCAT (Semi-Automated Dictionary Creation for Analyzing Text) stereotype content dictionaries.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
Given a word or phrase, SADBERT predicts:
|
|
8
|
+
|
|
9
|
+
| Output column | Description |
|
|
10
|
+
|---|---|
|
|
11
|
+
| `category` | Stereotype-content dimension (e.g. *Warmth*, *Competence*) |
|
|
12
|
+
| `probability` | Confidence of the category classifier |
|
|
13
|
+
| `valence` | Direction within the category: `1` = positive, `0` = neutral, `−1` = negative |
|
|
14
|
+
| `valence probability` | Confidence of the valence prediction |
|
|
15
|
+
| `interpretation` | Human-readable label (e.g. *"Warm"*, *"Incompetent"*, *"Moral"*) |
|
|
16
|
+
|
|
17
|
+
### Categories detected
|
|
18
|
+
|
|
19
|
+
**Major** (with valence): Warmth · Competence · Sociability · Morality · Ability · Assertiveness · Status · Beliefs · health · deviance · beauty · Politics · Religion
|
|
20
|
+
|
|
21
|
+
**Minor** (category only, no valence): emotions · Geography · Appearance · occupation · socialgroups · inhabitant · country · relative · insults · stem · humanities · art · Lacksknowledge · fortune · clothing · bodpart · bodprop · skin · bodcov · beliefsother · Other\_large · Other
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install sadbert
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
> **Note:** On first use, SADBERT automatically downloads ~2 GB of model weights from the HuggingFace Hub. These are cached locally in `~/.cache/huggingface/` and do not need to be re-downloaded on subsequent runs.
|
|
32
|
+
|
|
33
|
+
### GPU / Apple Silicon
|
|
34
|
+
|
|
35
|
+
SADBERT auto-detects CUDA and Apple MPS. To use a specific device, instantiate `SADBERT` directly:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from sadbert import SADBERT
|
|
39
|
+
model = SADBERT(device="cuda") # or "mps", "cpu"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import sadbert
|
|
48
|
+
|
|
49
|
+
# Single word — returns a DataFrame
|
|
50
|
+
df = sadbert.get_stereotype_content("honest")
|
|
51
|
+
print(df)
|
|
52
|
+
|
|
53
|
+
# Multiple words — stacked into one DataFrame (default)
|
|
54
|
+
df = sadbert.get_stereotype_content(["honest", "lazy", "senator"])
|
|
55
|
+
print(df)
|
|
56
|
+
|
|
57
|
+
# Multiple words — one DataFrame per word
|
|
58
|
+
results = sadbert.get_stereotype_content(["honest", "lazy"], stacked=False)
|
|
59
|
+
print(results["honest"])
|
|
60
|
+
print(results["lazy"])
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Example output
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
>>> sadbert.get_stereotype_content("honest")
|
|
67
|
+
|
|
68
|
+
category probability valence valence probability interpretation
|
|
69
|
+
0 Warmth 0.912 1.0 0.876 Warm
|
|
70
|
+
1 Morality 0.843 1.0 0.791 Moral
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## API reference
|
|
76
|
+
|
|
77
|
+
### `sadbert.get_stereotype_content(text, stacked=True)`
|
|
78
|
+
|
|
79
|
+
Module-level convenience function. Uses a shared, lazily-initialised `SADBERT` instance.
|
|
80
|
+
|
|
81
|
+
| Parameter | Type | Description |
|
|
82
|
+
|---|---|---|
|
|
83
|
+
| `text` | `str` or `list[str]` | Word(s) or phrase(s) to classify |
|
|
84
|
+
| `stacked` | `bool` | `True` (default): return one combined DataFrame with a `"text"` column. `False`: return a `dict[str, DataFrame]`. For single string input with `stacked=False`, returns the DataFrame directly. |
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
### `sadbert.SADBERT(device=None, batch_size=32, load_models=True)`
|
|
89
|
+
|
|
90
|
+
Instantiate your own SADBERT object for full control.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from sadbert import SADBERT
|
|
94
|
+
|
|
95
|
+
model = SADBERT(
|
|
96
|
+
device="cuda", # "cuda" | "mps" | "cpu" | None (auto-detect)
|
|
97
|
+
batch_size=64, # increase for faster throughput on GPU
|
|
98
|
+
load_models=True, # set False to defer model loading to first call
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
results = model.get_stereotype_content(["nurse", "engineer", "senator"])
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Model architecture
|
|
107
|
+
|
|
108
|
+
SADBERT uses a three-stage ensemble:
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
Input text
|
|
112
|
+
│
|
|
113
|
+
▼
|
|
114
|
+
┌─────────────────────────────────────────────────────┐
|
|
115
|
+
│ Stage 1 · Master model (SADBERT_master_model) │
|
|
116
|
+
│ Multi-label DistilBERT, 35 output classes │
|
|
117
|
+
│ Softmax probabilities compared against per-class │
|
|
118
|
+
│ Youden-J thresholds → candidate categories │
|
|
119
|
+
└─────────────────────────────────────────────────────┘
|
|
120
|
+
│ candidate categories
|
|
121
|
+
▼
|
|
122
|
+
┌─────────────────────────────────────────────────────┐
|
|
123
|
+
│ Stage 2 · Classifier heads (SADBERT_{cat}_classifier) │
|
|
124
|
+
│ One binary DistilBERT per category │
|
|
125
|
+
│ Veto gate — keeps only categories confirmed by │
|
|
126
|
+
│ both master model and dedicated head │
|
|
127
|
+
└─────────────────────────────────────────────────────┘
|
|
128
|
+
│ confirmed categories + probabilities
|
|
129
|
+
▼
|
|
130
|
+
┌─────────────────────────────────────────────────────┐
|
|
131
|
+
│ Stage 3 · Sentiment models (SADBERT_{cat}_sentiment) │
|
|
132
|
+
│ One 3-class DistilBERT per major category │
|
|
133
|
+
│ Predicts negative / neutral / positive valence │
|
|
134
|
+
└─────────────────────────────────────────────────────┘
|
|
135
|
+
│
|
|
136
|
+
▼
|
|
137
|
+
Results DataFrame
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
All models are hosted on HuggingFace at [huggingface.co/XanderD24](https://huggingface.co/XanderD24).
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Building from source
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
git clone https://github.com/XanderD24/sadbert.git
|
|
148
|
+
cd sadbert
|
|
149
|
+
|
|
150
|
+
# Install in editable mode with dev dependencies
|
|
151
|
+
pip install -e ".[dev]"
|
|
152
|
+
|
|
153
|
+
# Run tests
|
|
154
|
+
pytest
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Adding ROC_dict.pkl before publishing
|
|
158
|
+
|
|
159
|
+
`ROC_dict.pkl` (per-category probability thresholds) is **required** but not included in the repository for size reasons. Copy it into the data directory before building the wheel:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
cp /path/to/ROC_dict.pkl sadbert/data/ROC_dict.pkl
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Then build:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
python -m build # produces dist/sadbert-0.1.0.tar.gz and .whl
|
|
169
|
+
twine check dist/* # sanity-check before uploading
|
|
170
|
+
twine upload dist/* # publish to PyPI
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Citation
|
|
176
|
+
|
|
177
|
+
If you use SADBERT in research, please cite this package (and the original SADCAT work listed under "Citation of Original Work"):
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
@misc{sadbert2025,
|
|
181
|
+
author = {Deanhardt, Xander},
|
|
182
|
+
title = {{SADBERT}: Stereotype-content Analysis with {DistilBERT}},
|
|
183
|
+
year = {2025},
|
|
184
|
+
url = {https://github.com/XanderD24/sadbert},
|
|
185
|
+
}
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
## Citation of Original Work
|
|
190
|
+
All data used to fine-tune these models was taken from the SADCAT dictionary, published by Gandalf Nicolas, Xuecunzi Bai, and Susan T. Fiske, and hosted on GitHub by Gandalf Nicolas. It was first described in the following journal article:
|
|
191
|
+
|
|
192
|
+
Nicolas, Gandalf, et al. “Comprehensive Stereotype Content Dictionaries Using a Semi‐Automated Method.” European Journal of Social Psychology, vol. 51, no. 1, Feb. 2021, pp. 178–196, https://doi.org/10.1002/ejsp.2724.
|
|
193
|
+
|
|
194
|
+
Github Link = {https://github.com/gandalfnicolas/SADCAT/tree/master}
|
|
195
|
+
|
|
196
|
+
OSF Repository = {https://osf.io/yx45f/}
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
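SADBERT Software License (a permissive MIT-style license with an additional reservation of academic publication rights) — see [LICENSE](LICENSE) for details.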
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
6
|
+
# Project metadata
|
|
7
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
8
|
+
[project]
|
|
9
|
+
name = "sadbert"
|
|
10
|
+
version = "0.1.0"
|
|
11
|
+
description = "Stereotype-content analysis with fine-tuned DistilBERT models (SADCAT framework)"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = { file = "LICENSE" }
|
|
14
|
+
requires-python = ">=3.9"
|
|
15
|
+
|
|
16
|
+
authors = [
|
|
17
|
+
{ name = "Xander Deanhardt", email = "xanderd24@uchicago.edu" },
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
keywords = [
|
|
21
|
+
"NLP", "stereotype", "social cognition", "SADCAT",
|
|
22
|
+
"DistilBERT", "transformers", "sentiment analysis",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Development Status :: 3 - Alpha",
|
|
27
|
+
"Intended Audience :: Science/Research",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
29
|
+
"Topic :: Text Processing :: Linguistic",
|
|
30
|
+
"License :: Other/Proprietary License",
|
|
31
|
+
"Programming Language :: Python :: 3",
|
|
32
|
+
"Programming Language :: Python :: 3.9",
|
|
33
|
+
"Programming Language :: Python :: 3.10",
|
|
34
|
+
"Programming Language :: Python :: 3.11",
|
|
35
|
+
"Programming Language :: Python :: 3.12",
|
|
36
|
+
"Operating System :: OS Independent",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
# ── Runtime dependencies ──────────────────────────────────────────────────────
|
|
40
|
+
dependencies = [
|
|
41
|
+
"torch>=2.0.0",
|
|
42
|
+
"transformers>=4.35.0",
|
|
43
|
+
"pandas>=2.0.0",
|
|
44
|
+
"numpy>=1.24.0",
|
|
45
|
+
"huggingface_hub>=0.19.0",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
# ── Optional extras ───────────────────────────────────────────────────────────
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
dev = [
|
|
51
|
+
"pytest>=7.0",
|
|
52
|
+
"pytest-cov>=4.0",
|
|
53
|
+
"black>=23.0",
|
|
54
|
+
"ruff>=0.1.0",
|
|
55
|
+
"build>=1.0",
|
|
56
|
+
"twine>=4.0",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
# ── URLs ──────────────────────────────────────────────────────────────────────
|
|
60
|
+
[project.urls]
|
|
61
|
+
Homepage = "https://github.com/XanderD24/sadbert"
|
|
62
|
+
Repository = "https://github.com/XanderD24/sadbert"
|
|
63
|
+
"Bug Tracker" = "https://github.com/XanderD24/sadbert/issues"
|
|
64
|
+
"HuggingFace" = "https://huggingface.co/XanderD24"
|
|
65
|
+
|
|
66
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
67
|
+
# Package discovery & data files
|
|
68
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
69
|
+
[tool.setuptools.packages.find]
|
|
70
|
+
where = ["."] # look in the repo root
|
|
71
|
+
include = ["sadbert*"] # include sadbert and sadbert.data
|
|
72
|
+
|
|
73
|
+
[tool.setuptools.package-data]
|
|
74
|
+
# Bundle the static .pkl files so they are included in the wheel
|
|
75
|
+
"sadbert.data" = ["*.pkl"]
|
|
76
|
+
|
|
77
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
78
|
+
# Tool configuration
|
|
79
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
80
|
+
[tool.black]
|
|
81
|
+
line-length = 88
|
|
82
|
+
target-version = ["py39", "py310", "py311"]
|
|
83
|
+
|
|
84
|
+
[tool.ruff]
|
|
85
|
+
line-length = 88
|
|
86
|
+
select = ["E", "F", "W", "I"]
|
|
87
|
+
ignore = ["E501"]
|
|
88
|
+
|
|
89
|
+
[tool.pytest.ini_options]
|
|
90
|
+
testpaths = ["tests"]
|
|
91
|
+
addopts = "-v --tb=short"
|
|
92
|
+
markers = [
|
|
93
|
+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
|
94
|
+
]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Runtime dependencies for SADBERT
|
|
2
|
+
# These are the same constraints declared in pyproject.toml [project.dependencies]
|
|
3
|
+
# and are provided here for convenience (e.g. pip install -r requirements.txt).
|
|
4
|
+
|
|
5
|
+
torch>=2.0.0
|
|
6
|
+
transformers>=4.35.0
|
|
7
|
+
pandas>=2.0.0
|
|
8
|
+
numpy>=1.24.0
|
|
9
|
+
huggingface_hub>=0.19.0
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SADBERT — Stereotype-content Analysis with DistilBERT
|
|
3
|
+
======================================================
|
|
4
|
+
|
|
5
|
+
A Python package for identifying stereotype content dimensions
|
|
6
|
+
(warmth, competence, morality, …) in natural language text and
|
|
7
|
+
classifying their valence (positive / neutral / negative).
|
|
8
|
+
|
|
9
|
+
Quick start
|
|
10
|
+
-----------
|
|
11
|
+
>>> import sadbert
|
|
12
|
+
>>> sadbert.get_stereotype_content("She is a warm and caring nurse.")
|
|
13
|
+
|
|
14
|
+
Or use the class directly for more control:
|
|
15
|
+
|
|
16
|
+
>>> from sadbert import SADBERT
|
|
17
|
+
>>> model = SADBERT(device="cuda", batch_size=64)
|
|
18
|
+
>>> model.get_stereotype_content(["honest", "lazy", "brilliant"])
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from .core import (
|
|
22
|
+
SADBERT,
|
|
23
|
+
get_stereotype_content,
|
|
24
|
+
ALL_CATS,
|
|
25
|
+
MAJOR_CATS,
|
|
26
|
+
MINOR_CATS,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
__version__ = "0.1.0"
|
|
30
|
+
__author__ = "Xander Deanhardt"
|
|
31
|
+
__email__ = "xanderd24@uchicago.edu"
|
|
32
|
+
__license__ = "SADBERT Software License"
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"SADBERT",
|
|
36
|
+
"get_stereotype_content",
|
|
37
|
+
"ALL_CATS",
|
|
38
|
+
"MAJOR_CATS",
|
|
39
|
+
"MINOR_CATS",
|
|
40
|
+
]
|