collabnet 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
collabnet/__init__.py ADDED
File without changes
collabnet/analysis.py ADDED
@@ -0,0 +1,354 @@
1
+ # Create networks from publication records.
2
+ import json
3
+ from itertools import combinations
4
+ from pathlib import Path
5
+
6
+ import igraph as ig
7
+ import numpy as np
8
+ import pandas as pd
9
+ from tqdm import tqdm
10
+
11
+
12
+ def _json_default(o):
13
+ # Make common non-JSON types serializable
14
+ try:
15
+ if isinstance(o, (np.integer, np.floating, np.bool_)):
16
+ return o.item()
17
+ except Exception:
18
+ pass
19
+ if isinstance(o, (set, tuple)):
20
+ return list(o)
21
+ raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
22
+
23
+
24
+ def _serialize_seq_attrs(seq):
25
+ # seq is g.vs or g.es
26
+ attrs = list(seq.attributes())
27
+ for a in attrs:
28
+ vals = seq[a]
29
+ # If any value is not a GraphML primitive, serialize the entire column to JSON strings
30
+ if any(not (v is None or isinstance(v, (bool, int, float, str))) for v in vals):
31
+ seq[a] = [
32
+ json.dumps(
33
+ v, default=_json_default, ensure_ascii=False, separators=(",", ":")
34
+ )
35
+ if v is not None
36
+ else None
37
+ for v in vals
38
+ ]
39
+
40
+
41
def _serialize_graph_attrs(g: ig.Graph):
    """JSON-encode graph-level attributes that are not plain primitives.

    Attributes whose value is ``None``, bool, int, float or str are kept
    as they are; any other value is replaced in place by its compact JSON
    string representation.

    :param g: Graph whose graph-level attributes are rewritten in place.
    :returns: None
    """
    for key in list(g.attributes()):
        value = g[key]
        if value is None or isinstance(value, (bool, int, float, str)):
            continue
        g[key] = json.dumps(
            value, default=_json_default, ensure_ascii=False, separators=(",", ":")
        )
48
+
49
+
50
def write_graphml_with_json(g: ig.Graph, path: Path):
    """Write ``g`` to a GraphML file, JSON-encoding non-primitive attributes.

    Vertex, edge and graph attributes that are not ``None``, bool, int,
    float or str are converted to JSON strings before writing.

    Note: the conversion happens in place, so ``g`` keeps the JSON-encoded
    attribute values after this call.

    :param g: Graph to export (modified in place).
    :type g: igraph.Graph
    :param path: Target file; a ``pathlib.Path`` or a plain string.
    :type path: pathlib.Path | str
    :returns: None
    """
    _serialize_seq_attrs(g.vs)
    _serialize_seq_attrs(g.es)
    _serialize_graph_attrs(g)
    # Path() makes plain string paths work as well as Path objects.
    g.write_graphml(Path(path).as_posix())
56
+
57
+
58
class CreateNetwork:
    """Create networks for co-author and co-country analysis.

    Builds time-sliced networks from a dataframe of OpenAlex works (e.g., as
    produced by TransformOA). A time window is defined as
    (year - interval, year), e.g. (1960 - 5, 1960) -> (1955, 1960), and all
    works published in that range (both ends inclusive) end up in one network.

    Expected dataframe columns:
        - id, title, topics: copied into the edge attributes
        - publication_year: int
        - authorships: list of (author_id, [institution_ids]) for coauthor mode
        - countries: list[str] of ISO 3166-1 alpha-2 codes for cocountry mode

    :param dataframe: Source records to derive networks from.
    :type dataframe: pandas.DataFrame
    :param year_range: Inclusive (start_year, end_year); one network is
        written per year in this range.
    :type year_range: tuple[int, int]
    :param interval: Number of years the window reaches back from each year.
        Defaults to 1.
    :type interval: int, optional
    :param out_path: Directory where network files will be written.
    :type out_path: pathlib.Path | str, optional
    :param net_type: Network type to build: "coauthor" or "cocountry".
        Defaults to "coauthor".
    :type net_type: str, optional
    """

    # Column order of the per-publication edge tuples built by _create_edge.
    _EDGE_COLUMNS = [
        "source",
        "target",
        "weight",
        "paper",
        "title",
        "years",
        "topics",
    ]

    def __init__(
        self,
        dataframe: pd.DataFrame,
        year_range: tuple,
        interval: int = 1,
        out_path: Path = Path("."),
        net_type: str = "coauthor",
    ):
        """Init class."""
        self.dataframe = dataframe
        self.year_range = year_range
        self.interval = interval
        self.out_path = out_path
        self.net_type = net_type

    def run(self) -> str:
        """Build and export all networks and return the string "Done"."""
        self._write_graphml()
        return "Done"

    def _create_edge(self, row: dict) -> list:
        """Create all co-occurrence edges for a single publication record.

        Depending on ``self.net_type`` the actors are author IDs (first item
        of every ``row["authorships"]`` entry) or country codes
        (``row["countries"]``). Missing actors (``None`` or any other falsy
        placeholder) are dropped and duplicates are collapsed, then every
        unordered pair of the remaining actors yields one edge. Records with
        fewer than two distinct actors yield no edges.

        Both actors of an edge are sorted, so the same pair always gets the
        same (source, target) orientation regardless of the order in which
        the actors are listed on a publication.

        Returned edge format, one tuple per pair:
            (source, target, weight, paper, title, year, topics)

        where weight is always 1, paper is ``row["id"]``, title is
        ``row["title"]``, year is ``int(row["publication_year"])`` and topics
        is ``row["topics"]``.

        :param row: Publication record.
        :type row: dict | pandas.Series
        :returns: List of edges for the given publication.
        :rtype: list[tuple]
        :raises KeyError: If ``net_type`` is unsupported or a required key is
            missing from the input row.
        :raises TypeError: If the actor field is not iterable.
        """
        if self.net_type == "coauthor":
            key = "authorships"
        elif self.net_type == "cocountry":
            key = "countries"
        else:
            raise KeyError("Please select net_type equals coauthor or cocountry.")

        entries = row[key]
        if self.net_type == "coauthor":
            # Each authorship is (author_id, [institution_ids]); only the
            # author ID becomes a node.
            candidates = [entry[0] for entry in entries if entry]
        else:
            candidates = list(entries)

        # Filter before hashing so unhashable empty placeholders never reach
        # the set; sorting fixes the orientation of every pair.
        actors = sorted({actor for actor in candidates if actor})
        if len(actors) < 2:
            return []

        paper = row["id"]
        title = row["title"]
        year = int(row["publication_year"])
        topics = row["topics"]
        return [
            (src, trg, 1, paper, title, year, topics)
            for src, trg in combinations(actors, 2)
        ]

    def _write_graphml(self):
        """Build rolling-window networks and export them to GraphML files.

        For each year in the inclusive range [year_range[0], year_range[1]]:

        1) Select works with publication_year in
           [year - self.interval, year] (both ends inclusive).
        2) Create per-publication edges via `_create_edge(row)`.
        3) Aggregate the edges by (source, target) into
             - weight: number of publications the pair shares in the window,
             - paper: unique publication identifiers,
             - years: unique publication years,
             - topics: unique topics in first-seen order.
        4) Build an igraph Graph and write it to
           f"{self.net_type}_{year}.graphml" inside `self.out_path`.

        A window without any edge still produces a (empty) graph file.

        :returns: None
        :rtype: None
        :raises KeyError: If required dataframe columns are missing.
        :raises OSError: For I/O errors while writing GraphML files.
        """
        # Accept plain string directories as well as Path objects.
        out_dir = Path(self.out_path)
        first_year, last_year = self.year_range[0], self.year_range[1]
        for year in tqdm(range(first_year, last_year + 1)):
            window = self.dataframe.query(
                f"publication_year >= {year - self.interval} & publication_year <= {year}"
            )
            edges = [
                edge for _, row in window.iterrows() for edge in self._create_edge(row)
            ]
            dftemp = pd.DataFrame(edges, columns=self._EDGE_COLUMNS)
            weighted_edges = []
            for (source, target), group in dftemp.groupby(["source", "target"]):
                # dict.fromkeys de-duplicates while keeping a stable order,
                # so repeated runs write identical files.
                topics = list(
                    dict.fromkeys(
                        topic for per_paper in group["topics"] for topic in per_paper
                    )
                )
                weighted_edges.append(
                    (
                        source,
                        target,
                        group.shape[0],
                        group["paper"].unique().tolist(),
                        group["years"].unique().tolist(),
                        topics,
                    )
                )
            graph = ig.Graph.TupleList(
                weighted_edges, edge_attrs=["weight", "paper", "years", "topics"]
            )
            write_graphml_with_json(graph, out_dir / f"{self.net_type}_{year}.graphml")
250
+
251
+
252
class CalculateAICI:
    """Calculate adjusted internationalization collaboration index.

    The input dataframe needs the columns ``publication_year`` and
    ``countries`` (one list of country codes per work).

    :param dataframe: Works to analyse.
    :type dataframe: pandas.DataFrame
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Init class."""
        self.dataframe = dataframe

    def _check_country_exist(self, row: dict) -> bool:
        """Return True if the work lists at least one country entry."""
        return bool(row["countries"])

    def _check_no_country_exist(self, row: dict) -> bool:
        """Return True if the work lists no country entry at all."""
        return not row["countries"]

    def _check_is_international(self, row: dict) -> bool:
        """Return True if the work lists more than one distinct country.

        Missing codes (``None`` or other falsy placeholders) are ignored, so
        a work with one known and one unknown country is not counted as
        international.
        """
        countries = row["countries"]
        if not countries:
            return False
        # Filter before building the set: placeholders may be unhashable.
        return len({code for code in countries if code}) > 1

    def _generate_df(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Count works per publication year.

        :param dataframe: Works with ``publication_year`` and ``countries``.
        :type dataframe: pandas.DataFrame
        :returns: One row per year with the columns ``year``, ``papers``
            (all works), ``with_affil`` (works with country entries),
            ``no_affil`` (works without) and ``is_international`` (works
            with more than one distinct country).
        :rtype: pandas.DataFrame
        """
        article_per_year = dataframe.groupby("publication_year").size().to_dict()
        affil_per_year = {}
        no_affil_per_year = {}
        is_international_per_year = {}
        for year, group in dataframe.groupby("publication_year"):
            affil_per_year[year] = int(
                group.apply(self._check_country_exist, axis=1).sum()
            )
            no_affil_per_year[year] = int(
                group.apply(self._check_no_country_exist, axis=1).sum()
            )
            is_international_per_year[year] = int(
                group.apply(self._check_is_international, axis=1).sum()
            )
        # One dict per measure -> one row per measure; transpose so that
        # years become the index and measures the (numbered) columns.
        aici_df = pd.DataFrame(
            [
                article_per_year,
                affil_per_year,
                no_affil_per_year,
                is_international_per_year,
            ]
        ).T.rename(
            columns={0: "papers", 1: "with_affil", 2: "no_affil", 3: "is_international"}
        )
        aici_df = aici_df.reset_index().rename(columns={"index": "year"})
        return aici_df

    def complete_df(self) -> pd.DataFrame:
        """Calculate values for full dataframe (``level`` is "global")."""
        dftemp = self._generate_df(self.dataframe)
        dftemp.insert(0, "level", "global")
        return dftemp

    def country_df(self, country: str) -> pd.DataFrame:
        """Calculate values for one country.

        Only works whose ``countries`` entry contains ``country`` are
        counted. Works with a missing or non-iterable ``countries`` value
        are skipped.

        :param country: Country code to select.
        :type country: str
        :returns: Yearly counts with ``level`` set to ``country``.
        :rtype: pandas.DataFrame
        """

        def _lists_country(countries) -> bool:
            try:
                return country in countries
            except TypeError:
                # e.g. None / NaN instead of a list of codes
                return False

        country_mask = self.dataframe.countries.apply(_lists_country).astype(bool)
        dataframe = self.dataframe[country_mask].reset_index(drop=True)
        dftemp = self._generate_df(dataframe)
        dftemp.insert(0, "level", country)
        return dftemp

    def country_compare(self, country_list: list) -> pd.DataFrame:
        """Create data to compare country AICI.

        The global ``aici`` is ``is_international / with_affil`` per year.
        For each country, its yearly ``is_international`` count is divided
        by the global ``with_affil`` count of the same year.

        :param country_list: Country codes to include.
        :type country_list: list[str]
        :returns: Global rows followed by the rows of every country.
        :rtype: pandas.DataFrame
        """
        global_df = self.complete_df()
        global_df.insert(
            0, "aici", global_df["is_international"] / global_df["with_affil"]
        )
        # Global number of works with affiliation per year, used as the
        # common denominator for every country.
        norm_series = global_df.set_index("year")["with_affil"]
        df_list = [global_df]
        for country in country_list:
            country_df = self.country_df(country)
            aici_cnty = country_df.set_index("year")["is_international"] / norm_series
            country_df = country_df.merge(
                aici_cnty.to_frame("aici"), left_on="year", right_index=True
            )
            df_list.append(country_df)
        data = pd.concat(df_list, ignore_index=True)
        return data
collabnet/data.py ADDED
@@ -0,0 +1,230 @@
1
+ # Transform source data to required format.
2
+ # Source data can come from OpenAlex by using the utilities module.
3
+
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+ import pyalex
9
+ from tqdm import tqdm
10
+
11
+
12
+ class TransformOA:
13
+ """Read and transform Open Alex data.
14
+
15
+ File list should be a list of file paths.
16
+ Source files have to be in JSON format.
17
+
18
+ Using TransformOA.run() has the following workflow:
19
+ 1) Open the UTF-8 encoded JSON file.
20
+ 2) Load a list of OpenAlex Work records.
21
+ 3) Normalize each record via `_process_entry`.
22
+ 4) Create a pandas DataFrame and write it as JSON to `self.out_path`
23
+ using the same base filename.
24
+
25
+ Side effects:
26
+ - Writes a JSON file to `<self.out_path>/<file_path.name>`.
27
+
28
+ Note:
29
+ - UnicodeDecodeError from the JSON files are caught and logged.
30
+ - Other I/O and JSON parsing errors will propagate to the caller.
31
+
32
+ :param src_files: List of source file paths to JSON files containing OpenAlex records.
33
+ :type src_files: list[str] | list[pathlib.Path]
34
+ :param out_path: Directory where transformed outputs will be written. Defaults to the current directory.
35
+ :type out_path: str | pathlib.Path, optional
36
+ """
37
+
38
+ def __init__(self, src_files: list, out_path: str = "."):
39
+ """Init class."""
40
+ self.src_files = src_files
41
+ self.out_path = out_path
42
+
43
+ def _getAuthorAffilID(self, authors) -> tuple:
44
+ """Extract author and affiliation identifiers and associated country codes.
45
+
46
+ Expects the OpenAlex "authorships" structure:
47
+ - Each item should contain an "author" dict with an "id".
48
+ - Each item should contain an "institutions" list; each institution may have "id" and "country_code".
49
+
50
+ Returns a tuple:
51
+ - List of (author_id, [institution_ids]) pairs.
52
+ - List of country codes (one per institution encountered, duplicates possible).
53
+
54
+ Note: Missing keys are returned as empty dicts ({}).
55
+
56
+ :param authors: List of authorship entries from an OpenAlex Work.
57
+ :type authors: list[dict]
58
+ :returns: (author_affils, countries)
59
+ where author_affils is list[tuple[str | dict, list[str | dict]]]
60
+ and countries is list[str | dict]
61
+ :rtype: tuple[list[tuple[str | dict, list[str | dict]]], list[str | dict]]
62
+ :raises TypeError: If `authors` is not iterable.
63
+ """
64
+ result = []
65
+ countries = []
66
+ for author in authors:
67
+ idx = author.get("author", {}).get("id", {})
68
+ instList = []
69
+ institutions = author.get("institutions")
70
+ for ins in institutions:
71
+ instList.append(ins.get("id", {}))
72
+ countries.append(ins.get("country_code", {}))
73
+ result.append((idx, instList))
74
+ return result, countries
75
+
76
+ def _getTopicID(self, topics) -> list:
77
+ """Collect topic identifiers from an OpenAlex Work.
78
+
79
+ Expects a list of topic dicts, each with an "id" key.
80
+
81
+ Note: Missing keys are returned as empty dicts ({}).
82
+
83
+ :param topics: Topics section from an OpenAlex Work.
84
+ :type topics: list[dict]
85
+ :returns: List of topic IDs.
86
+ :rtype: list[str | dict]
87
+ :raises TypeError: If `topics` is not iterable.
88
+ """
89
+ result = []
90
+ for topic in topics:
91
+ result.append(topic.get("id", {}))
92
+ return result
93
+
94
+ def _getJournalID(self, publication_location) -> str | None:
95
+ """Return the source (journal) identifier from a Work's primary location.
96
+
97
+ Expects the "primary_location" structure:
98
+ - A dict containing a "source" dict with an "id" key.
99
+
100
+ :param publication_location: The Work's primary_location object.
101
+ :type publication_location: dict
102
+ :returns: Source ID if available; otherwise None.
103
+ :rtype: str | None
104
+ """
105
+ source = publication_location.get("source", {})
106
+ if source:
107
+ return source.get("id", {})
108
+ return None
109
+
110
+ def _process_entry(self, work: dict) -> dict:
111
+ """Normalize a single OpenAlex Work record.
112
+
113
+ Extracts a subset of fields from the raw work, reconstructs the abstract
114
+ from the abstract_inverted_index, replaces it with a plain-text abstract,
115
+ and enriches the record with processed authorships, countries, topics, and
116
+ primary_location identifiers.
117
+
118
+ Required input keys in `work` for transformations:
119
+ - abstract_inverted_index
120
+ - authorships
121
+ - topics
122
+ - primary_location
123
+
124
+ Output keys in the returned dict:
125
+ - abstract (reconstructed from abstract_inverted_index)
126
+ - authorships (processed via _getAuthorAffilID)
127
+ - countries (derived from authors' affiliations)
128
+ - topics (processed via _getTopicID)
129
+ - primary_location (processed via _getJournalID)
130
+
131
+ The keys id, doi, title, type, publication_year and referenced_works are copied.
132
+
133
+ :param work: Raw OpenAlex Work object as returned by the API.
134
+ :type work: dict
135
+ :returns: Normalized work record with selected fields and derived values.
136
+ :rtype: dict
137
+ :raises TypeError: If `work` is not a mapping-like object.
138
+ :raises KeyError: If any required field is missing from `work`.
139
+ """
140
+ data_keys = [
141
+ "doi",
142
+ "title",
143
+ "type",
144
+ "publication_year",
145
+ "abstract_inverted_index",
146
+ "referenced_works",
147
+ ]
148
+ result = dict()
149
+ result = {"id": work["id"]}
150
+ for key in data_keys:
151
+ result.update({key: work[key]})
152
+ tempWork = pyalex.Work(result)
153
+ abstract = tempWork["abstract"]
154
+ result.update({"abstract": abstract})
155
+ result.pop("abstract_inverted_index", None)
156
+ authors, countries = self._getAuthorAffilID(work["authorships"])
157
+ result.update({"authorships": authors})
158
+ result.update({"countries": countries})
159
+ result.update({"topics": self._getTopicID(work["topics"])})
160
+ result.update(
161
+ {"primary_location": self._getJournalID(work["primary_location"])}
162
+ )
163
+ return result
164
+
165
+ def _process_file(self, file_path: Path) -> None:
166
+ """Transform one source JSON file into the normalized output format.
167
+
168
+ :param file_path: Path to a UTF-8 encoded JSON file containing a list of
169
+ OpenAlex Work objects.
170
+ :type file_path: pathlib.Path
171
+ :returns: None
172
+ :rtype: None
173
+ :raises FileNotFoundError: If the input file does not exist.
174
+ :raises json.JSONDecodeError: If the file content is not valid JSON.
175
+ :raises OSError: For general I/O errors while reading or writing.
176
+ """
177
+ with open(Path(file_path), encoding="utf-8") as infile:
178
+ try:
179
+ data = json.load(infile)
180
+ result = [self._process_entry(work) for work in data]
181
+ dftemp = pd.DataFrame(result)
182
+ dftemp.to_json(f"{self.out_path}{file_path.name}")
183
+ except UnicodeDecodeError as r:
184
+ print("error in file", r)
185
+
186
+ def run(self) -> None:
187
+ """Process all source files defined in `self.src_files`.
188
+
189
+ Iterates over `self.src_files` with a progress bar and invokes
190
+ `_process_file` for each entry.
191
+
192
+ :returns: None
193
+ :rtype: None
194
+ """
195
+ for file_path in tqdm(self.src_files):
196
+ self._process_file(file_path)
197
+
198
+
199
def join_to_df(
    src_folder: Path, out_folder: Path = Path("."), selection: str = "*.json"
) -> pd.DataFrame:
    r"""Create combined dataframe from JSON files.

    Entries are de-duplicated based on their work ID; the first occurrence
    is kept. Files are read in sorted path order, so the result does not
    depend on the directory listing order.

    If only specific JSON files should be joined,
    specify a glob pattern like 'works_s\*.json'.
    Recursive patterns are supported, e.g.
    '\**/\*.json' for all JSON files in all subfolders.

    Side effects:
        - Writes the combined records as JSON lines to
          `<out_folder>/joined_data.json`.
        - Prints the number of removed duplicates.

    A `joined_data.json` left in `out_folder` by an earlier call is never
    read as input, even if the pattern matches it.

    :param src_folder: Root directory to search for input JSON files.
    :type src_folder: pathlib.Path | str
    :param out_folder: Directory where `joined_data.json` is written.
        Defaults to the current directory.
    :type out_folder: pathlib.Path | str, optional
    :param selection: Glob pattern used to select input JSON files relative to
        src_folder. Defaults to '\*.json'.
    :type selection: str, optional
    :returns: Combined dataframe of unique works from the selected JSON files.
    :rtype: pandas.DataFrame
    :raises ValueError: If no input file matches `selection`.
    """
    src_folder = Path(src_folder)
    out_file = Path(out_folder) / "joined_data.json"
    # The output uses JSON lines and cannot be parsed like the inputs, so
    # make sure a previous result is not picked up again.
    out_resolved = out_file.resolve()
    file_paths = sorted(
        path for path in src_folder.glob(selection) if path.resolve() != out_resolved
    )
    if not file_paths:
        raise ValueError(f"No files matching {selection!r} found in {src_folder}.")
    data = pd.concat([pd.read_json(path) for path in file_paths], ignore_index=True)
    dedup_data = data.drop_duplicates(subset="id")
    dedup_data.to_json(out_file, lines=True, orient="records")
    print(f"Deduplicated {data.shape[0] - dedup_data.shape[0]} entries.")
    return dedup_data
collabnet/utils.py ADDED
@@ -0,0 +1,165 @@
1
+ # Query OA for works in relation to a journal or topic
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import pyalex
7
+ from tqdm import tqdm
8
+ from validator_collection import validators
9
+
10
# Module-level pyalex configuration, applied as a side effect of importing
# this module: a single retry, a backoff factor of 0.1 and, going by the
# setting name, retries limited to HTTP 429, 500 and 503 responses.
+ pyalex.config.max_retries = 1
11
+ pyalex.config.retry_backoff_factor = 0.1
12
+ pyalex.config.retry_http_codes = [429, 500, 503]
13
+
14
+
15
+ class QueryOA:
16
+ """Query Open Alex to receive publication records.
17
+
18
+ :param config_email: Contact email used for OpenAlex polite API requests.
19
+ :type config_email: str
20
+ :param query_list: List of query terms or identifiers to query against OpenAlex.
21
+ :type query_list: list
22
+ :param year_range: Inclusive (start_year, end_year) for filtering publication years.
23
+ :type year_range: tuple[int, int]
24
+ :param out_path: Directory where results/artifacts should be written. Defaults to the current directory.
25
+ :type out_path: pathlib.Path, optional
26
+ :param query_type: Type of query to perform ("journal", "topic", or "institution"). Defaults to "topic".
27
+ :type query_type: str, optional
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ config_email: str,
33
+ query_list: list,
34
+ year_range: tuple,
35
+ out_path: Path = Path("."),
36
+ query_type: str = "topic",
37
+ n_max: int | None = None,
38
+ ):
39
+ assert query_type in ["topic", "journal", "institution"], (
40
+ "Use supported query type: topic, journal or institution"
41
+ )
42
+ if query_type == "topic":
43
+ assert all([x.startswith("t") for x in query_list]), (
44
+ "Provide list of correct topic IDs starting with t."
45
+ )
46
+ elif query_type == "journal":
47
+ assert all([x.startswith("s") for x in query_list]), (
48
+ "Provide list of correct journal IDs starting with s."
49
+ )
50
+ else:
51
+ pass
52
+ assert validators.email(config_email), "Provide a valid email address."
53
+ pyalex.config.email = config_email
54
+ self.query_list = query_list
55
+ self.query_type = query_type
56
+ self.year_range = year_range
57
+ self.out_path = out_path
58
+ self.n_max = n_max
59
+
60
+ def run(self) -> list:
61
+ file_list = self._run_query(
62
+ self.query_list, self.query_type, self.year_range, self.out_path
63
+ )
64
+ return file_list
65
+
66
+ def _run_query(self, query_list, query_type, year_range, out_path) -> list:
67
+ """Execute an OpenAlex query and collect publication records.
68
+
69
+ :param query_list: List of query terms or identifiers to query against OpenAlex.
70
+ :type query_list: list
71
+ :param query_type: Type of query to perform (e.g., "topic", "author", "source").
72
+ :type query_type: str
73
+ :param year_range: Inclusive (start_year, end_year) for filtering publication years.
74
+ :type year_range: tuple[int, int]
75
+ :param out_path: Directory where results or intermediate artifacts may be written.
76
+ :type out_path: pathlib.Path
77
+ :returns: Retrieved publication records.
78
+ :rtype: list[dict]
79
+ """
80
+ generated_files = []
81
+ for entry in tqdm(query_list):
82
+ all_current_works = []
83
+ if query_type == "topic":
84
+ temp_works = self._topic_query(entry, year_range)
85
+ elif query_type == "journal":
86
+ temp_works = self._journal_query(entry, year_range)
87
+ elif query_type == "institution":
88
+ temp_works = self._affiliation_query(entry, year_range)
89
+ else:
90
+ raise TypeError("Provide query_type.")
91
+ for res_temp_works in temp_works:
92
+ all_current_works.append(res_temp_works)
93
+ with open(
94
+ Path(out_path / f"works_{entry}.json"), "w", encoding="utf8"
95
+ ) as json_file:
96
+ json.dump(
97
+ [x for y in all_current_works for x in y],
98
+ json_file,
99
+ ensure_ascii=False,
100
+ )
101
+ generated_files.append(Path(out_path / f"works_{entry}.json"))
102
+ return generated_files
103
+
104
+ def _topic_query(self, entry: str, year_range: tuple):
105
+ """Run a topic-based query against OpenAlex for a single entry.
106
+
107
+ :param entry: Topic identifier to query (topic ids start with the letter t).
108
+ :type entry: str
109
+ :param year_range: Inclusive (start_year, end_year) range for filtering publication years.
110
+ :type year_range: tuple[int, int]
111
+ :returns: Publication records matching the topic and year filter.
112
+ :rtype: list[dict]
113
+ """
114
+ temp_works = (
115
+ pyalex.Works()
116
+ .filter(
117
+ topics={"id": entry},
118
+ from_publication_date=f"{year_range[0]}-01-01",
119
+ to_publication_date=f"{year_range[1]}-12-31",
120
+ )
121
+ .paginate(per_page=200, n_max=self.n_max)
122
+ )
123
+ return temp_works
124
+
125
+ def _journal_query(self, entry: str, year_range: tuple):
126
+ """Run a journal-based query against OpenAlex for a single entry.
127
+
128
+ :param entry: Journal identifier to query (journal ids start with the letter s).
129
+ :type entry: str
130
+ :param year_range: Inclusive (start_year, end_year) range for filtering publication years.
131
+ :type year_range: tuple[int, int]
132
+ :returns: Publication records matching the topic and year filter.
133
+ :rtype: list[dict]
134
+ """
135
+ temp_works = (
136
+ pyalex.Works()
137
+ .filter(
138
+ primary_location={"source": {"id": entry}},
139
+ from_publication_date=f"{year_range[0]}-01-01",
140
+ to_publication_date=f"{year_range[1]}-12-31",
141
+ )
142
+ .paginate(per_page=200, n_max=self.n_max)
143
+ )
144
+ return temp_works
145
+
146
+ def _affiliation_query(self, entry: str, year_range: tuple):
147
+ """Run a institution-based query against OpenAlex for a single entry.
148
+
149
+ :param entry: ROR ID to query (See ror.org for search options).
150
+ :type entry: str
151
+ :param year_range: Inclusive (start_year, end_year) range for filtering publication years.
152
+ :type year_range: tuple[int, int]
153
+ :returns: Publication records matching the topic and year filter.
154
+ :rtype: list[dict]
155
+ """
156
+ temp_works = (
157
+ pyalex.Works()
158
+ .filter(
159
+ authorships={"institutions": {"ror": entry}},
160
+ from_publication_date=f"{year_range[0]}-01-01",
161
+ to_publication_date=f"{year_range[1]}-12-31",
162
+ )
163
+ .paginate(per_page=200, n_max=self.n_max)
164
+ )
165
+ return temp_works
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.3
2
+ Name: collabnet
3
+ Version: 0.1.0
4
+ Summary: Build collaboration networks from publication records.
5
+ Author: Malte Vogl
6
+ Author-email: Malte Vogl <vogl@gea.mpg.de>
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Dist: pyalex
11
+ Requires-Dist: pandas
12
+ Requires-Dist: igraph
13
+ Requires-Dist: tqdm
14
+ Requires-Dist: matplotlib
15
+ Requires-Dist: validator-collection>=1.5.0
16
+ Requires-Python: >=3.12
17
+ Project-URL: Bug Tracker, https://gitlab.gwdg.de/mpigea/dt/collabnet/-/issues
18
+ Project-URL: Homepage, https://gitlab.gwdg.de/mpigea/dt/collabnet
19
+ Project-URL: Project Homepage, https://modelsen.gea.mpg.de
20
+ Description-Content-Type: text/markdown
21
+
22
+ # CollabNet
23
+
24
+ [![Documentation Status](https://readthedocs.org/projects/collabnet/badge/?version=latest)](https://collabnet.readthedocs.io/en/latest/?badge=latest)
25
+
26
+ This Python package generates co-authorship and co-country networks.
27
+ The source data (currently from OpenAlex) can be works for the same journal,
28
+ institution or topic. The data is transformed into a suitable format to facilitate
29
+ the network creation. Networks are generated and saved as GRAPHML files.
30
+
31
+ In addition, the package can calculate the Adjusted International Collaboration Index,
32
+ as defined in Lalli 2025.
33
+
34
+ ![Gephi Lite plot of Co-authorship network for 1980 with interval 3](docs/_static/gephi_lite_coauthors-1980-interval-3.png)
35
+
36
+ [Gephi Lite](https://lite.gephi.org) plot of Co-authorship network for 1980 with interval 3 with 738 nodes and 2402 edges.
37
+ Layout in ForceAtlas2 design, colors related to Louvain modularity class. Node size by degree. The selected node with label has
38
+ degree 16.
39
+
40
+ Documentation is available on [ReadTheDocs](https://collabnet.readthedocs.io).
41
+
42
+ ## Installation
43
+
44
+ tl;dr Use pip
45
+
46
+ ~~~bash
47
+ pip install collabnet
48
+ ~~~
49
+
50
+ Consider using a clean virtual environment to keep your main packages separated.
51
+ Create a new virtual environment and install the package
52
+
53
+ ~~~bash
54
+ python3 -m venv env
55
+ source env/bin/activate
56
+ pip install collabnet
57
+ ~~~
58
+
59
+ ## Example
60
+
61
+ You can find an example Jupyter Notebook in the [examples folder](../example/collab_pipeline.ipynb). It demonstrates how to use the package to query data, transform the data, and generate networks.
62
+
63
+ ## Testing
64
+
65
+ Tests can be run by running
66
+
67
+ ~~~bash
68
+ uv run pytest --cov-report=term-missing --cov=src/collabnet
69
+ ~~~
70
+ which installs the `test` dependency group, runs the test files, and reports the test coverage.
71
+
72
+ ## Building documentation
73
+
74
+ The documentation is built using _sphinx_. Installation of the _doc_ dependency group is automated with `uv`:
75
+
76
+ ~~~bash
77
+ uv run sphinx-build -c docs -b html docs docs/_build/html
78
+ ~~~
79
+
80
+ ## Funding information
81
+
82
+ This work is part of a collaboration between the department for
83
+ Structural Changes of the Technosphere, Max Planck Institute of Geoanthropology, Jena, Germany and
84
+ DIMEAS - Department of Mechanical and Aerospace Engineering, Politecnico di Torino, Torino, Italy.
@@ -0,0 +1,7 @@
1
+ collabnet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ collabnet/analysis.py,sha256=4IzlcoSOA6WuO_IxukbAHrpPjSxkKaYFiaj9RV577Hw,13322
3
+ collabnet/data.py,sha256=rWq2CGF5lHRheDtrDsjflfMpy7u-PRMEJiflTYHnPk4,9201
4
+ collabnet/utils.py,sha256=qPL2g3YkhtASsX-fc6fgkkLBbzvDZSnzAizLKVhM-30,6598
5
+ collabnet-0.1.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
6
+ collabnet-0.1.0.dist-info/METADATA,sha256=rpmQ5vgrPfO9Z_nSSg2gbVT54Mm-_-KIj7S5KYe-WyU,3010
7
+ collabnet-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.17
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any