cooplot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. cooplot-0.1.0/.gitattributes +1 -0
  2. cooplot-0.1.0/.github/agberens.png +0 -0
  3. cooplot-0.1.0/.github/excelclust.png +0 -0
  4. cooplot-0.1.0/.github/workflows/python-publish.yml +70 -0
  5. cooplot-0.1.0/.gitignore +31 -0
  6. cooplot-0.1.0/PKG-INFO +121 -0
  7. cooplot-0.1.0/README.md +102 -0
  8. cooplot-0.1.0/cooplot/__init__.py +1 -0
  9. cooplot-0.1.0/cooplot/aggregate.py +295 -0
  10. cooplot-0.1.0/cooplot/api.py +190 -0
  11. cooplot-0.1.0/cooplot/build.py +108 -0
  12. cooplot-0.1.0/cooplot/io.py +27 -0
  13. cooplot-0.1.0/cooplot/metrics.py +1459 -0
  14. cooplot-0.1.0/cooplot/scrape.py +226 -0
  15. cooplot-0.1.0/cooplot/viz.py +643 -0
  16. cooplot-0.1.0/notebooks/agberens/Aleksejs Tim/304/215enko.json" +22 -0
  17. cooplot-0.1.0/notebooks/agberens/Camila Roa.json +1 -0
  18. cooplot-0.1.0/notebooks/agberens/Christian Behrens.json +67 -0
  19. cooplot-0.1.0/notebooks/agberens/Dmitry Kobak.json +272 -0
  20. cooplot-0.1.0/notebooks/agberens/Fabio Seel.json +1 -0
  21. cooplot-0.1.0/notebooks/agberens/Ifeoma Veronica Nwabufo.json +17 -0
  22. cooplot-0.1.0/notebooks/agberens/Indu Ilanchezian.json +42 -0
  23. cooplot-0.1.0/notebooks/agberens/Jan Lause.json +42 -0
  24. cooplot-0.1.0/notebooks/agberens/Jan Niklas B/303/266hm.json" +52 -0
  25. cooplot-0.1.0/notebooks/agberens/Jonas Beck.json +57 -0
  26. cooplot-0.1.0/notebooks/agberens/Jonathan Oesterle.json +62 -0
  27. cooplot-0.1.0/notebooks/agberens/Julius Gervelmeyer.json +22 -0
  28. cooplot-0.1.0/notebooks/agberens/Kerol Djoumessi.json +47 -0
  29. cooplot-0.1.0/notebooks/agberens/Kyra Kadhim.json +22 -0
  30. cooplot-0.1.0/notebooks/agberens/Lisa Koch.json +232 -0
  31. cooplot-0.1.0/notebooks/agberens/Lisa Schmors.json +52 -0
  32. cooplot-0.1.0/notebooks/agberens/Luke Rogerson.json +62 -0
  33. cooplot-0.1.0/notebooks/agberens/Murat Se/303/247kin Ayhan.json" +127 -0
  34. cooplot-0.1.0/notebooks/agberens/Patrick K/303/266hler.json" +12 -0
  35. cooplot-0.1.0/notebooks/agberens/Philipp Berens.json +782 -0
  36. cooplot-0.1.0/notebooks/agberens/Rita Gonz/303/241lez M/303/241rquez.json" +62 -0
  37. cooplot-0.1.0/notebooks/agberens/Sacha Sokoloski.json +82 -0
  38. cooplot-0.1.0/notebooks/agberens/Samuel Ofosu Mensah.json +37 -0
  39. cooplot-0.1.0/notebooks/agberens/Sarah M/303/274ller.json" +72 -0
  40. cooplot-0.1.0/notebooks/agberens/Sarah Strau/303/237.json" +17 -0
  41. cooplot-0.1.0/notebooks/agberens/Sebastian Damrich.json +107 -0
  42. cooplot-0.1.0/notebooks/agberens/Simone Ebert.json +42 -0
  43. cooplot-0.1.0/notebooks/agberens/Sophie Laturnus.json +57 -0
  44. cooplot-0.1.0/notebooks/agberens/Verena Jasmin Hallitschke.json +7 -0
  45. cooplot-0.1.0/notebooks/agberens/Yves Bernaerts.json +47 -0
  46. cooplot-0.1.0/notebooks/agberens/Ziwei Huang.json +47 -0
  47. cooplot-0.1.0/notebooks/agberens-coop-ref.txt +179 -0
  48. cooplot-0.1.0/notebooks/agberens-coop.csv +86 -0
  49. cooplot-0.1.0/notebooks/agberens-grouped/All.json +1266 -0
  50. cooplot-0.1.0/notebooks/agberens-grouped/Embedding.json +681 -0
  51. cooplot-0.1.0/notebooks/agberens-grouped/MedML.json +772 -0
  52. cooplot-0.1.0/notebooks/agberens-grouped/Neural.json +822 -0
  53. cooplot-0.1.0/notebooks/agberens.csv +32 -0
  54. cooplot-0.1.0/notebooks/agberens.ipynb +275 -0
  55. cooplot-0.1.0/pyproject.toml +37 -0
  56. cooplot-0.1.0/tests/__init__.py +0 -0
  57. cooplot-0.1.0/tests/test_aggregate_metrics.py +874 -0
  58. cooplot-0.1.0/tests/test_pubmed_live.py +30 -0
  59. cooplot-0.1.0/uv.lock +2603 -0
@@ -0,0 +1 @@
1
+ *.ipynb linguist-documentation
Binary file
Binary file
@@ -0,0 +1,70 @@
1
+ # This workflow will upload a Python Package to PyPI when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ release-build:
20
+ runs-on: ubuntu-latest
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.x"
28
+
29
+ - name: Build release distributions
30
+ run: |
31
+ # NOTE: put your own distribution build steps here.
32
+ python -m pip install build
33
+ python -m build
34
+
35
+ - name: Upload distributions
36
+ uses: actions/upload-artifact@v4
37
+ with:
38
+ name: release-dists
39
+ path: dist/
40
+
41
+ pypi-publish:
42
+ runs-on: ubuntu-latest
43
+ needs:
44
+ - release-build
45
+ permissions:
46
+ # IMPORTANT: this permission is mandatory for trusted publishing
47
+ id-token: write
48
+
49
+ # Dedicated environments with protections for publishing are strongly recommended.
50
+ # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
51
+ environment:
52
+ name: pypi
53
+ # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
54
+ # url: https://pypi.org/p/cooplot
55
+ #
56
+ # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
57
+ # ALTERNATIVE: exactly, uncomment the following line instead:
58
+ # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
59
+
60
+ steps:
61
+ - name: Retrieve release distributions
62
+ uses: actions/download-artifact@v4
63
+ with:
64
+ name: release-dists
65
+ path: dist/
66
+
67
+ - name: Publish release distributions to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
69
+ with:
70
+ packages-dir: dist/
@@ -0,0 +1,31 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ .pytest_cache
6
+
7
+ # Distribution / packaging
8
+ build/
9
+ dist/
10
+ *.egg-info/
11
+ *.egg
12
+
13
+ # Environments
14
+ .env
15
+ .venv
16
+ env/
17
+ venv/
18
+
19
+ # Jupyter Notebook
20
+ .ipynb_checkpoints
21
+
22
+ # macOS
23
+ .DS_Store
24
+
25
+ # Editor directories and files
26
+ .idea/
27
+ .vscode/
28
+ *.swp
29
+ *.swo
30
+
31
+ dev/
cooplot-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.4
2
+ Name: cooplot
3
+ Version: 0.1.0
4
+ Summary: Co-op between members and subgroups.
5
+ Requires-Python: >=3.10.0
6
+ Requires-Dist: matplotlib>=3.10.6
7
+ Requires-Dist: mne-connectivity>=0.7.0
8
+ Requires-Dist: numpy>=2.2.6
9
+ Requires-Dist: python-dotenv>=1.0
10
+ Requires-Dist: requests>=2.32.5
11
+ Requires-Dist: scholarly>=1.7.11
12
+ Requires-Dist: tqdm>=4.67.1
13
+ Provides-Extra: dev
14
+ Requires-Dist: maturin; extra == 'dev'
15
+ Requires-Dist: pytest; extra == 'dev'
16
+ Requires-Dist: ruff; extra == 'dev'
17
+ Requires-Dist: twine; extra == 'dev'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # cooplot
21
+
22
+ Analysis of co-op between members and subgroups.
23
+
24
+ ## Example Gallery
25
+
26
+ ## Cluster of Excellence – Machine Learning for Science
27
+
28
+ ```python
29
+ import cooplot
30
+ palette = {
31
+ "Life Science": "#61859e",
32
+ "Norms": "#e0aa41",
33
+ "Human Science": "#5fb4d0",
34
+ "ML": "#bc3b2f",
35
+ "Physical Science": "#608dd2",
36
+ }
37
+
38
+ _, people = cooplot.load_csv("excelclust.csv", delimiter=";")
39
+ pubs = cooplot.scrape(
40
+ people,
41
+ name_col="name",
42
+ scholar_col="scholar_id",
43
+ semantic_col="semantic_id",
44
+ cache_dir=".cache/excelclust",
45
+ )
46
+ mats = cooplot.build(pubs, windows=["2014-2018", "2019-2023"], name_col="name", group_col="group")
47
+ fig = cooplot.show(mats, group_col="group", style="circle", heatmap_counts=True, palette=palette)
48
+ fig.savefig("../.github/excelclust.png", dpi=300)
49
+ ```
50
+
51
+ ![](.github/excelclust.png)
52
+
53
+ ## AG Berens
54
+
55
+ ```python
56
+ import cooplot
57
+ header, people = cooplot.load_csv("agberens.csv", delimiter=";")
58
+ pubs = cooplot.scrape(
59
+ people,
60
+ name_col="name",
61
+ scholar_col="scholar_id",
62
+ semantic_col="semantic_id",
63
+ cache_dir=".cache/hai",
64
+ drop_subtitle=False,
65
+ fallback_semantic_if_empty=True, # try Semantic if GS had 0 pubs
66
+ )
67
+
68
+
69
+ mats = cooplot.build(pubs, windows=["2016-2025"], name_col="name", group_col="group")
70
+ fig = cooplot.show(mats, group_col="group", style="both", heatmap_counts=True, figsize=(18,10))
71
+ ```
72
+
73
+ ![](.github/agberens.png)
74
+
75
+ You can also generate a list of references highlighting cross-group collaborations:
76
+
77
+ ```python
78
+ groups = cooplot.aggregate(pubs, name_col="name", cache_dir=".cache/agberens-group")
79
+ _ = cooplot.cross_group_publications(groups, year_from=2016, year_to=2025, out_path=".cache/agberens-coop.csv", enrich_crossref=True)
80
+ cooplot.cross_group_report(".cache/agberens-coop.csv", out_path=".cache/agberens-coop-ref.txt", verbose=True)
81
+ ```
82
+
83
+ which will give you
84
+
85
+ ```
86
+ ...
87
+
88
+ Schmors, L., Kotkat, A. H., Bauer, Y., Huang, Z., Crombie, D., Meyerolbersleben, L. S., Sokoloski, S., Berens, P., & Busse, L. (2025). Effects of corticothalamic feedback depend on visual responsiveness and stimulus type. IScience, 28(6), 112481. https://doi.org/10.1016/j.isci.2025.112481
89
+ Collaboration: All (Philipp Berens, Ziwei Huang) and Neural (Lisa Schmors, Sacha Sokoloski).
90
+
91
+ Gervelmeyer, J., Müller, S., Huang, Z., & Berens, P. (2025). Fundus Image Toolbox: A Python package for fundus image processing. Journal of Open Source Software, 10(108), 7101. https://doi.org/10.21105/joss.07101
92
+ Collaboration: All (Philipp Berens, Ziwei Huang) and MedML (Julius Gervelmeyer, Sarah Müller).
93
+
94
+ Schmidt, G., Heidrich, H., Berens, P., & Müller, S. (2025). Learning Disease State from Noisy Ordinal Disease Progression Labels. Medical Image Computing and Computer Assisted Intervention – MICCAI 2025, 284–293. https://doi.org/10.1007/978-3-032-04971-1_27
95
+ Collaboration: All (Philipp Berens) and MedML (Sarah Müller).
96
+
97
+ Ofosu Mensah, S., Djoumessi, K., & Berens, P. (2025). Prototype-Guided and Lightweight Adapters for Inherent Interpretation and Generalisation in Federated Learning. Medical Image Computing and Computer Assisted Intervention – MICCAI 2025, 464–473. https://doi.org/10.1007/978-3-032-04981-0_44
98
+ Collaboration: All (Philipp Berens) and MedML (Kerol Djoumessi, Samuel Ofosu Mensah).
99
+
100
+ Oesterle, J., Ran, Y., Stahr, P., Kerr, J. N. D., Schubert, T., Berens, P., & Euler, T. (2025). Task-specific regional circuit adaptations in distinct mouse retinal ganglion cells. Science Advances, 11(17). https://doi.org/10.1126/sciadv.adp7075
101
+ Collaboration: All (Philipp Berens) and Neural (Jonathan Oesterle).
102
+ ```
103
+
104
+
105
+ ## Installation
106
+
107
+ ```bash
108
+ uv pip install cooplot
109
+ ```
110
+
111
+ or
112
+
113
+ ```bash
114
+ git clone git@github.com:berenslab/cooplot.git
115
+ cd cooplot
116
+ uv pip install -e ".[dev]"
117
+ ```
118
+
119
+ ## Usage
120
+
121
+ See the [example notebook](https://github.com/berenslab/cooplot/blob/main/notebooks/agberens.ipynb) for a complete usage example.
@@ -0,0 +1,102 @@
1
+ # cooplot
2
+
3
+ Analysis of co-op between members and subgroups.
4
+
5
+ ## Example Gallery
6
+
7
+ ## Cluster of Excellence – Machine Learning for Science
8
+
9
+ ```python
10
+ import cooplot
11
+ palette = {
12
+ "Life Science": "#61859e",
13
+ "Norms": "#e0aa41",
14
+ "Human Science": "#5fb4d0",
15
+ "ML": "#bc3b2f",
16
+ "Physical Science": "#608dd2",
17
+ }
18
+
19
+ _, people = cooplot.load_csv("excelclust.csv", delimiter=";")
20
+ pubs = cooplot.scrape(
21
+ people,
22
+ name_col="name",
23
+ scholar_col="scholar_id",
24
+ semantic_col="semantic_id",
25
+ cache_dir=".cache/excelclust",
26
+ )
27
+ mats = cooplot.build(pubs, windows=["2014-2018", "2019-2023"], name_col="name", group_col="group")
28
+ fig = cooplot.show(mats, group_col="group", style="circle", heatmap_counts=True, palette=palette)
29
+ fig.savefig("../.github/excelclust.png", dpi=300)
30
+ ```
31
+
32
+ ![](.github/excelclust.png)
33
+
34
+ ## AG Berens
35
+
36
+ ```python
37
+ import cooplot
38
+ header, people = cooplot.load_csv("agberens.csv", delimiter=";")
39
+ pubs = cooplot.scrape(
40
+ people,
41
+ name_col="name",
42
+ scholar_col="scholar_id",
43
+ semantic_col="semantic_id",
44
+ cache_dir=".cache/hai",
45
+ drop_subtitle=False,
46
+ fallback_semantic_if_empty=True, # try Semantic if GS had 0 pubs
47
+ )
48
+
49
+
50
+ mats = cooplot.build(pubs, windows=["2016-2025"], name_col="name", group_col="group")
51
+ fig = cooplot.show(mats, group_col="group", style="both", heatmap_counts=True, figsize=(18,10))
52
+ ```
53
+
54
+ ![](.github/agberens.png)
55
+
56
+ You can also generate a list of references highlighting cross-group collaborations:
57
+
58
+ ```python
59
+ groups = cooplot.aggregate(pubs, name_col="name", cache_dir=".cache/agberens-group")
60
+ _ = cooplot.cross_group_publications(groups, year_from=2016, year_to=2025, out_path=".cache/agberens-coop.csv", enrich_crossref=True)
61
+ cooplot.cross_group_report(".cache/agberens-coop.csv", out_path=".cache/agberens-coop-ref.txt", verbose=True)
62
+ ```
63
+
64
+ which will give you
65
+
66
+ ```
67
+ ...
68
+
69
+ Schmors, L., Kotkat, A. H., Bauer, Y., Huang, Z., Crombie, D., Meyerolbersleben, L. S., Sokoloski, S., Berens, P., & Busse, L. (2025). Effects of corticothalamic feedback depend on visual responsiveness and stimulus type. IScience, 28(6), 112481. https://doi.org/10.1016/j.isci.2025.112481
70
+ Collaboration: All (Philipp Berens, Ziwei Huang) and Neural (Lisa Schmors, Sacha Sokoloski).
71
+
72
+ Gervelmeyer, J., Müller, S., Huang, Z., & Berens, P. (2025). Fundus Image Toolbox: A Python package for fundus image processing. Journal of Open Source Software, 10(108), 7101. https://doi.org/10.21105/joss.07101
73
+ Collaboration: All (Philipp Berens, Ziwei Huang) and MedML (Julius Gervelmeyer, Sarah Müller).
74
+
75
+ Schmidt, G., Heidrich, H., Berens, P., & Müller, S. (2025). Learning Disease State from Noisy Ordinal Disease Progression Labels. Medical Image Computing and Computer Assisted Intervention – MICCAI 2025, 284–293. https://doi.org/10.1007/978-3-032-04971-1_27
76
+ Collaboration: All (Philipp Berens) and MedML (Sarah Müller).
77
+
78
+ Ofosu Mensah, S., Djoumessi, K., & Berens, P. (2025). Prototype-Guided and Lightweight Adapters for Inherent Interpretation and Generalisation in Federated Learning. Medical Image Computing and Computer Assisted Intervention – MICCAI 2025, 464–473. https://doi.org/10.1007/978-3-032-04981-0_44
79
+ Collaboration: All (Philipp Berens) and MedML (Kerol Djoumessi, Samuel Ofosu Mensah).
80
+
81
+ Oesterle, J., Ran, Y., Stahr, P., Kerr, J. N. D., Schubert, T., Berens, P., & Euler, T. (2025). Task-specific regional circuit adaptations in distinct mouse retinal ganglion cells. Science Advances, 11(17). https://doi.org/10.1126/sciadv.adp7075
82
+ Collaboration: All (Philipp Berens) and Neural (Jonathan Oesterle).
83
+ ```
84
+
85
+
86
+ ## Installation
87
+
88
+ ```bash
89
+ uv pip install cooplot
90
+ ```
91
+
92
+ or
93
+
94
+ ```bash
95
+ git clone git@github.com:berenslab/cooplot.git
96
+ cd cooplot
97
+ uv pip install -e ".[dev]"
98
+ ```
99
+
100
+ ## Usage
101
+
102
+ See the [example notebook](https://github.com/berenslab/cooplot/blob/main/notebooks/agberens.ipynb) for a complete usage example.
@@ -0,0 +1 @@
1
+ from .api import *
@@ -0,0 +1,295 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from collections import defaultdict
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Dict, Iterable, List, Optional, Tuple
9
+
10
+ DEFAULT_CACHE_DIR = Path(".cache/cooplot/groups")
11
+ _UNLABELED = "Unlabeled"
12
+ _slug_pattern = re.compile(r"[^A-Za-z0-9._-]+")
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class GroupedPublications:
17
+ """Container for group-level publication data."""
18
+
19
+ by_group: Dict[str, List[dict]]
20
+ paths: Dict[str, Path]
21
+
22
+ def sorted_groups(self) -> List[str]:
23
+ return sorted(self.by_group.keys(), key=str.lower)
24
+
25
+ def to_author_list(self, name_field: str = "name") -> List[dict]:
26
+ """Return a minimal people list that can be fed into ``build``."""
27
+
28
+ records: List[dict] = []
29
+ for group in self.sorted_groups():
30
+ records.append({name_field: group, "group": group})
31
+ return records
32
+
33
+ def exclude_groups(self, groups: Iterable[str]) -> "GroupedPublications":
34
+ """Return a new instance without the specified groups."""
35
+
36
+ to_remove = {str(group).strip() for group in groups if group is not None}
37
+ to_remove.discard("")
38
+ if not to_remove:
39
+ return self
40
+
41
+ filtered_by_group = {
42
+ group: pubs
43
+ for group, pubs in self.by_group.items()
44
+ if group not in to_remove
45
+ }
46
+ filtered_paths = {
47
+ group: path for group, path in self.paths.items() if group not in to_remove
48
+ }
49
+ return GroupedPublications(by_group=filtered_by_group, paths=filtered_paths)
50
+
51
+
52
+ def _lastname(name: str) -> str:
53
+ return (name or "").strip().split()[-1].lower()
54
+
55
+
56
+ def _normalize_group(value: Optional[str]) -> str:
57
+ if value is None:
58
+ return _UNLABELED
59
+ group = str(value).strip()
60
+ return group if group else _UNLABELED
61
+
62
+
63
+ def _slugify(label: str) -> str:
64
+ slug = _slug_pattern.sub("_", label.strip())
65
+ slug = slug.strip("._")
66
+ return slug or "group"
67
+
68
+
69
+ def _publication_key(publication: dict) -> Optional[Tuple[str, Optional[int]]]:
70
+ norm_title = (publication.get("norm_title") or "").strip()
71
+ if not norm_title:
72
+ return None
73
+ year = publication.get("year")
74
+ if isinstance(year, int):
75
+ return norm_title, year
76
+ return norm_title, None
77
+
78
+
79
+ def _format_record(record: dict) -> dict:
80
+ authors = sorted(record["authors"], key=_lastname)
81
+ return {
82
+ "title": record["title"],
83
+ "norm_title": record["norm_title"],
84
+ "year": record["year"],
85
+ "authors": authors,
86
+ }
87
+
88
+
89
+ def _publication_sort_key(record: dict) -> Tuple[int, str]:
90
+ year = record.get("year")
91
+ norm_title = (record.get("norm_title") or record.get("title") or "").lower()
92
+ year_key = year if isinstance(year, int) else -1
93
+ return (year_key, norm_title)
94
+
95
+
96
+ def aggregate_publications(
97
+ publications_by_author: Dict[str, List[dict]],
98
+ people: Iterable[dict],
99
+ *,
100
+ name_col: str = "name",
101
+ group_col: str = "group",
102
+ cache_dir: Path | str = DEFAULT_CACHE_DIR,
103
+ include_unlabeled: bool = True,
104
+ save_json: bool = True,
105
+ ensure_ascii: bool = False,
106
+ ) -> GroupedPublications:
107
+ """Group publications by ``group_col`` and deduplicate by normalized title.
108
+
109
+ Parameters
110
+ ----------
111
+ publications_by_author
112
+ Mapping of author name to list of publication dicts, as returned by
113
+ :func:`cooplot.scrape.scrape_all`.
114
+ people
115
+ Iterable of records describing each author. ``group_col`` is used to map
116
+ authors onto the aggregation key.
117
+ name_col
118
+ Field name within ``people`` entries identifying each author.
119
+ group_col
120
+ Field name used to determine group membership.
121
+ cache_dir
122
+ Directory where group-level JSON files will be stored if ``save_json`` is
123
+ ``True``. The directory is created if needed.
124
+ include_unlabeled
125
+ Whether authors without a ``group_col`` value should be collected under an
126
+ ``"Unlabeled"`` bucket.
127
+ save_json
128
+ When ``True`` the grouped publication lists are written to individual
129
+ JSON files under ``cache_dir``.
130
+ ensure_ascii
131
+ Passed through to :func:`json.dumps` so callers can enforce ASCII-only
132
+ output if desired.
133
+
134
+ Returns
135
+ -------
136
+ GroupedPublications
137
+ Dataclass containing the grouped publication mapping and the optional
138
+ cache file paths (empty when ``save_json`` is ``False``).
139
+ """
140
+
141
+ name_to_group: Dict[str, str] = {}
142
+ for person in people:
143
+ name_value = person.get(name_col)
144
+ if not isinstance(name_value, str) or not name_value.strip():
145
+ continue
146
+ group_value = _normalize_group(person.get(group_col))
147
+ name_to_group[name_value] = group_value
148
+
149
+ grouped: Dict[str, Dict[Tuple[str, Optional[int]], dict]] = {}
150
+
151
+ for author, publications in publications_by_author.items():
152
+ group_label = name_to_group.get(author, _UNLABELED)
153
+ if group_label == _UNLABELED and not include_unlabeled:
154
+ continue
155
+
156
+ bucket = grouped.setdefault(group_label, {})
157
+ for publication in publications:
158
+ key = _publication_key(publication)
159
+ if key is None:
160
+ continue
161
+ norm_title, year = key
162
+ title = publication.get("title") or norm_title
163
+ entry = bucket.setdefault(
164
+ key,
165
+ {
166
+ "title": title,
167
+ "norm_title": norm_title,
168
+ "year": year,
169
+ "authors": set(),
170
+ },
171
+ )
172
+ if entry["title"] == entry["norm_title"] and publication.get("title"):
173
+ entry["title"] = publication["title"]
174
+ entry["authors"].add(author)
175
+
176
+ grouped_lists: Dict[str, List[dict]] = {}
177
+ for group_label, records in grouped.items():
178
+ formatted_records = [_format_record(rec) for rec in records.values()]
179
+ formatted_records.sort(key=_publication_sort_key)
180
+ grouped_lists[group_label] = formatted_records
181
+
182
+ paths: Dict[str, Path] = {}
183
+ if save_json:
184
+ cache_path = Path(cache_dir)
185
+ cache_path.mkdir(parents=True, exist_ok=True)
186
+ for group_label, records in grouped_lists.items():
187
+ filename = f"{_slugify(group_label)}.json"
188
+ out_path = cache_path / filename
189
+ out_path.write_text(
190
+ json.dumps(records, ensure_ascii=ensure_ascii, indent=2),
191
+ encoding="utf-8",
192
+ )
193
+ paths[group_label] = out_path
194
+
195
+ return GroupedPublications(by_group=grouped_lists, paths=paths)
196
+
197
+
198
+ def aggregate_cross_group_data(
199
+ records: Iterable[dict],
200
+ *,
201
+ filter: Optional[str] = None,
202
+ ) -> GroupedPublications:
203
+ """Construct a :class:`GroupedPublications` from cross-group collaboration records.
204
+
205
+ Parameters
206
+ ----------
207
+ records
208
+ Iterable of dicts as returned by :func:`cooplot.metrics.cross_group_publications`
209
+ (or compatible structure) where each record contains ``title``, ``norm_title``,
210
+ ``year``, ``groups`` and ``authors`` entries.
211
+ filter
212
+ Optional string selecting records that contain identifiers. Supported values are
213
+ ``"doi"``, ``"pubmed"`` (or ``"pubmed_id"`` / ``"pmid"``), and ``"identifier"``
214
+ (alias ``"any"``) which keeps entries having either DOI or PubMed identifiers.
215
+ When ``None`` (default) no filtering is applied.
216
+ """
217
+
218
+ filter_normalized = (filter or "").strip().lower()
219
+ if filter_normalized and filter_normalized not in {
220
+ "doi",
221
+ "pubmed",
222
+ "pubmed_id",
223
+ "pmid",
224
+ "identifier",
225
+ "any",
226
+ }:
227
+ raise ValueError(
228
+ "filter must be one of None, 'doi', 'pubmed', 'pubmed_id', 'pmid', "
229
+ "'identifier', or 'any'",
230
+ )
231
+
232
+ def _has_doi(record: dict) -> bool:
233
+ doi = record.get("doi") or record.get("DOI")
234
+ return bool(isinstance(doi, str) and doi.strip())
235
+
236
+ def _has_pubmed(record: dict) -> bool:
237
+ pmid = record.get("pubmed_id") or record.get("pmid")
238
+ return bool(isinstance(pmid, str) and pmid.strip())
239
+
240
+ def _passes_filter(record: dict) -> bool:
241
+ if not filter_normalized:
242
+ return True
243
+ if filter_normalized == "doi":
244
+ return _has_doi(record)
245
+ if filter_normalized in {"pubmed", "pubmed_id", "pmid"}:
246
+ return _has_pubmed(record)
247
+ if filter_normalized in {"identifier", "any"}:
248
+ return _has_doi(record) or _has_pubmed(record)
249
+ return True
250
+
251
+ by_group: Dict[str, List[dict]] = defaultdict(list)
252
+ for record in records:
253
+ if not isinstance(record, dict):
254
+ continue
255
+ if not _passes_filter(record):
256
+ continue
257
+ title = record.get("title")
258
+ norm_title = record.get("norm_title")
259
+ if not isinstance(norm_title, str) or not norm_title.strip():
260
+ continue
261
+ year = record.get("year")
262
+ authors_by_group = record.get("authors") or {}
263
+ groups = record.get("groups") or list(authors_by_group.keys())
264
+ if not groups:
265
+ continue
266
+ base = {
267
+ "title": title or norm_title,
268
+ "norm_title": norm_title,
269
+ "year": year if isinstance(year, int) else None,
270
+ }
271
+ for group in groups:
272
+ if not isinstance(group, str):
273
+ continue
274
+ group_name = group.strip()
275
+ if not group_name:
276
+ continue
277
+ authors = authors_by_group.get(group) or authors_by_group.get(
278
+ group_name, []
279
+ )
280
+ if not isinstance(authors, list):
281
+ authors = list(authors) # tolerate iterables/sets
282
+ filtered_authors = [
283
+ author
284
+ for author in authors
285
+ if isinstance(author, str) and author.strip()
286
+ ]
287
+ formatted = dict(base)
288
+ formatted["authors"] = sorted(filtered_authors, key=_lastname)
289
+ by_group[group_name].append(formatted)
290
+
291
+ grouped_lists = {
292
+ group: sorted(pubs, key=_publication_sort_key)
293
+ for group, pubs in by_group.items()
294
+ }
295
+ return GroupedPublications(by_group=grouped_lists, paths={})