emb-diversity 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. emb_diversity-0.0.3/LICENSE +21 -0
  2. emb_diversity-0.0.3/PKG-INFO +386 -0
  3. emb_diversity-0.0.3/README.md +346 -0
  4. emb_diversity-0.0.3/pyproject.toml +92 -0
  5. emb_diversity-0.0.3/setup.cfg +4 -0
  6. emb_diversity-0.0.3/src/emb_diversity/__init__.py +63 -0
  7. emb_diversity-0.0.3/src/emb_diversity/_accepts_text.py +104 -0
  8. emb_diversity-0.0.3/src/emb_diversity/_registry.py +51 -0
  9. emb_diversity-0.0.3/src/emb_diversity/axes_registry.py +54 -0
  10. emb_diversity-0.0.3/src/emb_diversity/cli.py +212 -0
  11. emb_diversity-0.0.3/src/emb_diversity/compute_pairwise.py +144 -0
  12. emb_diversity-0.0.3/src/emb_diversity/convenience.py +87 -0
  13. emb_diversity-0.0.3/src/emb_diversity/embed.py +45 -0
  14. emb_diversity-0.0.3/src/emb_diversity/embeddings/SBERT.py +38 -0
  15. emb_diversity-0.0.3/src/emb_diversity/embeddings/SimCSE.py +24 -0
  16. emb_diversity-0.0.3/src/emb_diversity/embeddings/__init__.py +0 -0
  17. emb_diversity-0.0.3/src/emb_diversity/embeddings/embed.py +72 -0
  18. emb_diversity-0.0.3/src/emb_diversity/eval/__init__.py +0 -0
  19. emb_diversity-0.0.3/src/emb_diversity/eval/data/STEL.py +76 -0
  20. emb_diversity-0.0.3/src/emb_diversity/eval/data/__init__.py +0 -0
  21. emb_diversity-0.0.3/src/emb_diversity/eval/data/synthstel.py +137 -0
  22. emb_diversity-0.0.3/src/emb_diversity/evaluate_measures.py +260 -0
  23. emb_diversity-0.0.3/src/emb_diversity/measures/__init__.py +0 -0
  24. emb_diversity-0.0.3/src/emb_diversity/measures/_types.py +14 -0
  25. emb_diversity-0.0.3/src/emb_diversity/measures/bins_entropy.py +169 -0
  26. emb_diversity-0.0.3/src/emb_diversity/measures/bottleneck.py +47 -0
  27. emb_diversity-0.0.3/src/emb_diversity/measures/chamfer_dist.py +64 -0
  28. emb_diversity-0.0.3/src/emb_diversity/measures/cluster_inertia.py +59 -0
  29. emb_diversity-0.0.3/src/emb_diversity/measures/convex_hull_volume_2d.py +109 -0
  30. emb_diversity-0.0.3/src/emb_diversity/measures/dcscore.py +100 -0
  31. emb_diversity-0.0.3/src/emb_diversity/measures/diameter.py +37 -0
  32. emb_diversity-0.0.3/src/emb_diversity/measures/dist_dispersion.py +38 -0
  33. emb_diversity-0.0.3/src/emb_diversity/measures/energy.py +50 -0
  34. emb_diversity-0.0.3/src/emb_diversity/measures/graph_entropy.py +81 -0
  35. emb_diversity-0.0.3/src/emb_diversity/measures/hamdiv.py +107 -0
  36. emb_diversity-0.0.3/src/emb_diversity/measures/log_determinant.py +128 -0
  37. emb_diversity-0.0.3/src/emb_diversity/measures/mean_pw_dist.py +37 -0
  38. emb_diversity-0.0.3/src/emb_diversity/measures/mst_dispersion.py +47 -0
  39. emb_diversity-0.0.3/src/emb_diversity/measures/radius.py +52 -0
  40. emb_diversity-0.0.3/src/emb_diversity/measures/renyi_entropy.py +140 -0
  41. emb_diversity-0.0.3/src/emb_diversity/measures/span_centroid.py +55 -0
  42. emb_diversity-0.0.3/src/emb_diversity/measures/span_medoid.py +42 -0
  43. emb_diversity-0.0.3/src/emb_diversity/measures/sum_bottleneck.py +56 -0
  44. emb_diversity-0.0.3/src/emb_diversity/measures/sum_diameter.py +55 -0
  45. emb_diversity-0.0.3/src/emb_diversity/measures/utils.py +27 -0
  46. emb_diversity-0.0.3/src/emb_diversity/measures/vendi_score.py +76 -0
  47. emb_diversity-0.0.3/src/emb_diversity/measures_registry.py +68 -0
  48. emb_diversity-0.0.3/src/emb_diversity/plot/__init__.py +0 -0
  49. emb_diversity-0.0.3/src/emb_diversity/two_d.py +230 -0
  50. emb_diversity-0.0.3/src/emb_diversity/utility/__init__.py +3 -0
  51. emb_diversity-0.0.3/src/emb_diversity/utility/_cache.py +85 -0
  52. emb_diversity-0.0.3/src/emb_diversity/utility/project_root.py +17 -0
  53. emb_diversity-0.0.3/src/emb_diversity.egg-info/PKG-INFO +386 -0
  54. emb_diversity-0.0.3/src/emb_diversity.egg-info/SOURCES.txt +64 -0
  55. emb_diversity-0.0.3/src/emb_diversity.egg-info/dependency_links.txt +1 -0
  56. emb_diversity-0.0.3/src/emb_diversity.egg-info/entry_points.txt +2 -0
  57. emb_diversity-0.0.3/src/emb_diversity.egg-info/requires.txt +11 -0
  58. emb_diversity-0.0.3/src/emb_diversity.egg-info/top_level.txt +1 -0
  59. emb_diversity-0.0.3/test/test_SBERT.py +158 -0
  60. emb_diversity-0.0.3/test/test_all_measures.py +33 -0
  61. emb_diversity-0.0.3/test/test_compute_pairwise_cache.py +193 -0
  62. emb_diversity-0.0.3/test/test_convenience.py +33 -0
  63. emb_diversity-0.0.3/test/test_diversity.py +967 -0
  64. emb_diversity-0.0.3/test/test_embed_cache.py +235 -0
  65. emb_diversity-0.0.3/test/test_measure_sets.py +50 -0
  66. emb_diversity-0.0.3/test/test_two_d.py +108 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Cantao Su, Anna Wegmann, Menan Velayuthan, Dong Nguyen, Esther Ploeger
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,386 @@
1
+ Metadata-Version: 2.4
2
+ Name: emb-diversity
3
+ Version: 0.0.3
4
+ Summary: A package for measuring diversity in text and vector data
5
+ Author-email: Cantao Su <c.su@uu.nl>, Menan Velayuthan <m.velayuthan@uu.nl>, Esther Ploeger <e.ploeger@uu.nl>, Dong Nguyen <d.p.nguyen@uu.nl>, Anna Wegmann <a.m.wegmann@uu.nl>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/nlpsoc/Diversity-Measurement
8
+ Project-URL: Documentation, https://nlpsoc.github.io/Diversity-Measurement/
9
+ Project-URL: Repository, https://github.com/nlpsoc/Diversity-Measurement
10
+ Project-URL: Bug Tracker, https://github.com/nlpsoc/Diversity-Measurement/issues
11
+ Keywords: diversity,nlp,embeddings,emb-diversity,metrics
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Text Processing :: Linguistic
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
+ Classifier: Operating System :: OS Independent
24
+ Classifier: Natural Language :: English
25
+ Requires-Python: >=3.11
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: datasets~=3.3.2
29
+ Requires-Dist: matplotlib~=3.10.0
30
+ Requires-Dist: networkx>=2.8.8
31
+ Requires-Dist: numpy~=2.3.5
32
+ Requires-Dist: pandas~=2.3.1
33
+ Requires-Dist: scikit-learn~=1.7.1
34
+ Requires-Dist: scipy~=1.16.0
35
+ Requires-Dist: sentence-transformers>=2.7.0
36
+ Requires-Dist: typer>=0.9.0
37
+ Requires-Dist: umap-learn>=0.5.9
38
+ Requires-Dist: vendi-score>=0.0.3
39
+ Dynamic: license-file
40
+
41
+ # emb-diversity
42
+
43
+ <!-- docs-intro-start -->
44
+ A Python package for measuring data diversity on small- to medium-sized text datasets. All measures calculate diversity based on embeddings, i.e., vector representations of your data. Depending on which embedding model you use, you can calculate semantic, stylistic, and other types of diversity with our package.
45
+
46
+ This library is developed as part of the [DataDivers](https://datadivers-erc.github.io/) project.
47
+ <!-- docs-intro-end -->
48
+
49
+ 📖 **Documentation:** <https://nlpsoc.github.io/Diversity-Measurement/>
50
+
51
+ ## Install
52
+
53
+ <!-- docs-install-start -->
54
+ Install the latest release from PyPI:
55
+
56
+ ```bash
57
+ pip install emb-diversity
58
+ ```
59
+
60
+ The first time you measure diversity, the default embedding model
61
+ (`all-mpnet-base-v2`, ~420 MB) is downloaded from the Hugging Face Hub and
62
+ cached locally, so later runs are fast and work offline.
63
+ <!-- docs-install-end -->
64
+
65
+ ## Usage
66
+
67
+ <!-- docs-quickstart-start -->
68
+
69
+ Measuring the diversity of a dataset with our package is easy:
70
+
71
+ ```python
72
+ from emb_diversity import measure_diversity
73
+
74
+ # more style-diverse, more topic-uniform (music)
75
+ texts_a = [
76
+ "I thoroughly enjoy the hair bands.",
77
+ "songs of the 80's are the best.",
78
+ "Hip Hop is going DOWNHILL!!!!!",
79
+ "rock music just makes me feel good",
80
+ "The 80's rocked!That generation had the best music!"
81
+ ]
82
+
83
+ # Uses the default measures and semantic embeddings
84
+ print(measure_diversity(texts_a))
85
+ # -> {'graph_entropy': 6.86..., 'vendi_score': 4.12..., 'mean_pw_dist': 0.69...}
86
+ ```
87
+
88
+ Note that measuring the diversity of a dataset is usually only meaningful when comparing it to another dataset. The reason is that diversity values in isolation are not easily interpretable: they are unbounded, sensitive to dataset size, and sensitive to the embedding space used. Let's add another corpus.
89
+
90
+ ```python
91
+ # more style-uniform (formal), more topic-diverse
92
+ texts_b = [
93
+ "I thoroughly enjoy the hair bands.",
94
+ "They have not caused any harm to me.",
95
+ "He has a very distinct walk.",
96
+ "It depends on what they will pay.",
97
+ "I would go out with the son of a preacher.",
98
+ ]
99
+
100
+ print(measure_diversity(texts_a))
101
+ # -> {'graph_entropy': 6.86..., 'vendi_score': 4.12..., 'mean_pw_dist': 0.69...}
102
+
103
+ print(measure_diversity(texts_b))
104
+ # -> {'graph_entropy': 6.91..., 'vendi_score': 4.93..., 'mean_pw_dist': 0.98...}
105
+ ```
106
+
107
+ When a measure considers a dataset to be more diverse, it will assign it a higher diversity value. Here, the three default measures consistently show that `texts_b` is more diverse than `texts_a`. This can change when we change which diversity "axis" is considered, for example, "style" instead of "semantic".
108
+
109
+ ```python
110
+ # Use a different diversity axis; for style diversity, AnnaWegmann/style-embeddings is the default
111
+ print(measure_diversity(texts_a, diversity_axis="style"))
112
+ # -> {'graph_entropy': 6.69..., 'vendi_score': 4.17..., 'mean_pw_dist': 0.93...}
113
+ print(measure_diversity(texts_b, diversity_axis="style"))
114
+ # -> {'graph_entropy': 6.32..., 'vendi_score': 2.24..., 'mean_pw_dist': 0.32...}
115
+ ```
116
+
117
+ You can also specify a different embedding model with a Hugging Face identifier, for example, a model trained for Dutch. Be careful to use models that were trained on the diversity axis you are interested in; otherwise, you might get inconsistent results!
118
+
119
+ ```python
120
+ # Use a specific embedding model (here a Dutch BERT model)
121
+ print(measure_diversity(texts_a, embedding_model="GroNLP/bert-base-dutch-cased"))
122
+ # -> {'graph_entropy': 6.61..., 'vendi_score': 1.89..., 'mean_pw_dist': 0.20...}
123
+ print(measure_diversity(texts_b, embedding_model="GroNLP/bert-base-dutch-cased"))
124
+ # -> {'graph_entropy': 6.80..., 'vendi_score': 1.52..., 'mean_pw_dist': 0.11...}
125
+ ```
126
+
127
+ You can also use specific measures; see the overview at https://nlpsoc.github.io/Diversity-Measurement/user-guide/measures.html. Use them with caution: some measures might be worse for your use case than others. We recommend testing whether your chosen measure and embedding space capture your diversity axis of interest.
128
+ ```python
129
+ # Run specific measures
130
+ print(measure_diversity(texts_a, measure=["diameter", "log_determinant"]))
131
+ # -> {'diameter': 0.94..., 'log_determinant': -0.93...}
132
+ print(measure_diversity(texts_b, measure=["diameter", "log_determinant"]))
133
+ # -> {'diameter': 1.0..., 'log_determinant': -0.06...}
134
+ ```
135
+
136
+ Note that most measures return unbounded values that cannot be compared for datasets with differing sizes. Happy diversity measuring!
137
+ <!-- docs-quickstart-end -->
138
+
139
+ ## Table of Contents
140
+
141
+ - [Install](#install)
142
+ - [Usage](#usage)
143
+ - [Development](#development)
144
+ - [Development setup](#development-setup)
145
+ - [Suggested Workflow for Collaboration](#suggested-workflow-for-collaboration)
146
+ - [Working with uv](#working-with-uv)
147
+ - [Docstring Style Guide](#docstring-style-guide)
148
+ - [Adding New Measures](#adding-new-measures)
149
+ - [Adding New Diversity Axes](#adding-new-diversity-axes)
150
+ - [Building and publishing a release](#building-and-publishing-a-release)
151
+ - [Funding](#funding)
152
+ - [Citation](#citation)
153
+
154
+ ## Development
155
+
156
+ ### Development setup
157
+
158
+ To work on `emb-diversity` itself, install from a clone with
159
+ [`uv`](https://docs.astral.sh/uv/getting-started/installation/):
160
+
161
+ ```bash
162
+ git clone https://github.com/nlpsoc/Diversity-Measurement.git
163
+ cd Diversity-Measurement
164
+ uv sync --group dev # runtime + dev tools (pytest, docs, ...)
165
+ source .venv/bin/activate
166
+ ```
167
+
168
+ Use `uv sync --no-group dev` to install only the runtime dependencies.
169
+
170
+ ### Suggested Workflow for Collaboration
171
+
172
+ 1. **Create a new branch** for your feature or bug fix:
173
+ ```bash
174
+ git checkout -b feature/my-feature
175
+ ```
176
+ 2. **Make your changes** in the codebase.
177
+ 3. **Run tests** to ensure everything works as expected:
178
+ ```bash
179
+ pytest
180
+ ```
181
+ 4. **Commit your changes** with a descriptive message:
182
+ ```bash
183
+ git add .
184
+ git commit -m "Add feature X"
185
+ ```
186
+ 5. **Push your branch** to the remote repository:
187
+ ```bash
188
+ git push origin feature/my-feature
189
+ ```
190
+ 6. **Create a pull request** on GitHub to merge your changes into the main branch and request a review from your team members.
191
+ 7. **Address any feedback** from the review process.
192
+ 8. Once approved, **merge your pull request** into the main branch.
193
+ 9. **Delete your branch** after merging to keep the repository clean:
194
+ ```bash
195
+ git branch -d feature/my-feature
196
+ git push origin --delete feature/my-feature
197
+ ```
198
+
199
+ ### Working with uv
200
+
201
+ #### Adding Packages with `uv add`
202
+
203
+ To add packages to your project, always use `uv add` rather than `uv pip install`. This ensures that your dependencies are properly managed and recorded in your `pyproject.toml`. For example:
204
+
205
+ ```bash
206
+ uv add <package-name>
207
+ ```
208
+
209
+ #### Adding Packages to a Dev Group
210
+
211
+ If you need to add a package specifically to your development environment, you can add it to the `dev` group like this:
212
+
213
+ ```bash
214
+ uv add --group dev <package-name>
215
+ ```
216
+
217
+ #### Switching Between Dev and Standard Mode
218
+
219
+ After you are done with testing and want to go back to standard mode, run:
220
+
221
+ ```bash
222
+ uv sync --no-group dev
223
+ ```
224
+
225
+ This will disable all additional groups and just load your main project dependencies.
226
+
227
+ #### Best Practice: Run `uv lock -U`
228
+
229
+ Whenever you upgrade, downgrade, or change versions of packages, it's a good practice to run:
230
+
231
+ ```bash
232
+ uv lock -U
233
+ ```
234
+
235
+ This updates your `uv.lock` file to ensure all versions are consistent and everything is in sync.
236
+
237
+ ### Docstring Style Guide
238
+
239
+ This project uses **Google-style docstrings** which are automatically parsed by the Sphinx Napoleon extension.
240
+
241
+ #### Functions and Methods
242
+
243
+ ```python
244
+ def calculate_diversity(vectors: np.ndarray, method: str = "vendi") -> float:
245
+ """Calculate diversity score for a set of vectors.
246
+
247
+ This function computes various diversity metrics for vector representations.
248
+ The default method uses the Vendi Score which is based on matrix entropy.
249
+
250
+ References:
251
+ Cox, Samuel Rhys, Yunlong Wang, Ashraf Abdul, Christian von der Weth, and Brian Y. Lim. "Directed Diversity: Leveraging Language Embedding Distances for Collective Creativity in Crowd Ideation." Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, May 6, 2021, 1–35. https://doi.org/10.1145/3411764.3445782.
252
+
253
+ Args:
254
+ vectors: Array of shape (n_samples, n_features) containing the vectors.
255
+ method: Diversity calculation method. Options are "vendi", "entropy",
256
+ or "distinctness". Defaults to "vendi".
257
+
258
+ Returns:
259
+ Diversity score as a float between 0 and 1, where higher values
260
+ indicate greater diversity.
261
+
262
+ Raises:
263
+ ValueError: If vectors array is empty or method is not recognized.
264
+
265
+ Example:
266
+ >>> vectors = np.array([[1, 0], [0, 1], [1, 1]])
267
+ >>> score = calculate_diversity(vectors)
268
+ >>> print(f"Diversity: {score:.2f}")
269
+ Diversity: 0.87
270
+ """
271
+ pass
272
+ ```
273
+
274
+ #### Key Points
275
+
276
+ - **One-line summary**: Start with a brief summary in imperative mood ("Calculate", not "Calculates")
277
+ - **Blank line**: After the summary, add a blank line before any detailed description
278
+ - **References**: Add related papers
279
+ - **Args**: Document each parameter with type information
280
+ - **Returns**: Describe what the function returns
281
+ - **Raises**: Document exceptions that might be raised
282
+ - **Example**: Include usage examples when helpful
283
+ - **Type hints**: Use type hints in function signatures AND document them in docstrings
284
+
285
+ #### Section Headers
286
+
287
+ Use these section headers in docstrings:
288
+ - `References:` — Related papers
289
+ - `Args:` — Function/method parameters
290
+ - `Returns:` — Return value description
291
+ - `Raises:` — Exceptions that may be raised
292
+ - `Yields:` — For generators
293
+ - `Attributes:` — For class attributes
294
+ - `Example:` or `Examples:` — Usage examples
295
+ - `Note:` — Important notes
296
+ - `Warning:` — Warnings about usage
297
+
298
+ Further reading: [Google Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) · [Sphinx Napoleon docs](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html)
299
+
300
+ ### Adding New Measures
301
+
302
+ When you add a new measure to `src/emb_diversity/measures/`:
303
+
304
+ 1. Create a new file with the function decorated with `@accepts_text` and a complete docstring following the style guide above.
305
+ 2. Export it from `src/emb_diversity/__init__.py` if it should be part of the public API.
306
+ 3. Register it in `src/emb_diversity/measures_registry.py` with `measures.register("name", func)`.
307
+ 4. **Update `docs/source/user-guide/measures.md`** — add a row for the new measure in the appropriate table.
308
+
309
+ ### Adding New Diversity Axes
310
+
311
+ Register a new axis in `src/emb_diversity/axes_registry.py`:
312
+
313
+ ```python
314
+ from emb_diversity.axes_registry import DiversityAxis, axes
315
+
316
+ axes.register(
317
+ "multilingual",
318
+ DiversityAxis(
319
+ name="multilingual",
320
+ default_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
321
+ description="Cross-lingual semantic diversity",
322
+ ),
323
+ )
324
+ ```
325
+
326
+ Update `docs/source/user-guide/axes.md` with the new axis.
327
+
328
+ ### Building and publishing a release
329
+
330
+ Releases are published by CI via PyPI Trusted Publishing (no API token is
331
+ stored), in two stages — a TestPyPI dry run, then production PyPI:
332
+
333
+ 1. **Bump `version`** in `pyproject.toml`, commit, and merge to `main`. A version
334
+ number can be uploaded only once, so every release needs a new number — you
335
+ cannot re-publish or overwrite an existing version.
336
+ 2. **Tag and push → TestPyPI.** Pushing a `v*` tag triggers `publish-testpypi.yml`
337
+ (it checks the tag matches the `pyproject.toml` version):
338
+ ```bash
339
+ git tag v0.0.1 # must match the version in pyproject.toml
340
+ git push origin v0.0.1
341
+ ```
342
+ Verify the result at <https://test.pypi.org/project/emb-diversity/>.
343
+ 3. **Create a GitHub Release → PyPI.** When the TestPyPI run looks good, create a
344
+ GitHub Release for the tag. That triggers `publish-pypi.yml`, which uploads to
345
+ real PyPI (<https://pypi.org/project/emb-diversity/>). Create the release either:
346
+ - **on GitHub:** go to the repository's **Releases** page (right-hand sidebar of
347
+ the repo, or `.../releases`) → **Draft a new release** → under *Choose a tag*
348
+ pick the existing tag (e.g. `v0.0.1`) → add a title and notes → **Publish
349
+ release**; or
350
+ - **with the GitHub CLI:**
351
+ ```bash
352
+ gh release create v0.0.1 --title "v0.0.1" --notes "First release"
353
+ ```
354
+
355
+ Publishing the release (not just drafting it) is what triggers the workflow.
356
+
357
+ To build and validate **locally** before tagging (optional):
358
+
359
+ ```bash
360
+ rm -rf dist # clear artifacts from previous versions first
361
+ uv build # -> dist/emb_diversity-<version>.{tar.gz,whl}
362
+ uvx twine check dist/* # validate metadata + that the README renders on PyPI
363
+ ```
364
+
365
+ `uv build` only *adds* to `dist/`, so clear it first when building a new version —
366
+ otherwise old artifacts linger and an upload would try (and fail) to re-publish
367
+ them. CI doesn't need this: each run starts from a clean checkout.
368
+
369
+ ## Funding
370
+
371
+ This work is supported by the ERC Starting Grant **DataDivers** (101162980).
372
+
373
+ ## Citation
374
+
375
+ <!-- docs-citation-start -->
376
+ There is no paper yet, so if you use `emb-diversity` in your work, please cite
377
+ the software:
378
+
379
+ ```bibtex
380
+ @misc{emb_diversity,
381
+ author = {Su, Cantao and Velayuthan, Menan and Ploeger, Esther and Nguyen, Dong and Wegmann, Anna},
382
+ title = {emb-diversity},
383
+ url = {https://github.com/nlpsoc/Diversity-Measurement},
384
+ }
385
+ ```
386
+ <!-- docs-citation-end -->