collabnet 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- collabnet/__init__.py +0 -0
- collabnet/analysis.py +354 -0
- collabnet/data.py +230 -0
- collabnet/utils.py +165 -0
- collabnet-0.1.0.dist-info/METADATA +84 -0
- collabnet-0.1.0.dist-info/RECORD +7 -0
- collabnet-0.1.0.dist-info/WHEEL +4 -0
collabnet/__init__.py
ADDED
|
File without changes
|
collabnet/analysis.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
# Create networks from publication records.
|
|
2
|
+
import json
|
|
3
|
+
from itertools import combinations
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import igraph as ig
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _json_default(o):
|
|
13
|
+
# Make common non-JSON types serializable
|
|
14
|
+
try:
|
|
15
|
+
if isinstance(o, (np.integer, np.floating, np.bool_)):
|
|
16
|
+
return o.item()
|
|
17
|
+
except Exception:
|
|
18
|
+
pass
|
|
19
|
+
if isinstance(o, (set, tuple)):
|
|
20
|
+
return list(o)
|
|
21
|
+
raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _serialize_seq_attrs(seq):
|
|
25
|
+
# seq is g.vs or g.es
|
|
26
|
+
attrs = list(seq.attributes())
|
|
27
|
+
for a in attrs:
|
|
28
|
+
vals = seq[a]
|
|
29
|
+
# If any value is not a GraphML primitive, serialize the entire column to JSON strings
|
|
30
|
+
if any(not (v is None or isinstance(v, (bool, int, float, str))) for v in vals):
|
|
31
|
+
seq[a] = [
|
|
32
|
+
json.dumps(
|
|
33
|
+
v, default=_json_default, ensure_ascii=False, separators=(",", ":")
|
|
34
|
+
)
|
|
35
|
+
if v is not None
|
|
36
|
+
else None
|
|
37
|
+
for v in vals
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _serialize_graph_attrs(g: ig.Graph):
    """JSON-encode graph-level attributes of ``g`` that are not primitives.

    Attributes whose value is ``None``, bool, int, float or str are kept as they
    are; every other value is replaced in place by its compact JSON string.

    :param g: Graph whose graph-level attributes are rewritten in place.
    :returns: None
    """
    plain_types = (bool, int, float, str)
    # Snapshot the attribute names first because values are reassigned below.
    for key in list(g.attributes()):
        value = g[key]
        if value is None or isinstance(value, plain_types):
            continue
        g[key] = json.dumps(
            value, default=_json_default, ensure_ascii=False, separators=(",", ":")
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def write_graphml_with_json(g: ig.Graph, path: Path):
    """Write ``g`` as GraphML after JSON-encoding non-primitive attributes.

    Vertex, edge and graph attributes that are not ``None``, bool, int, float or
    str are converted to JSON strings before the file is written.

    Note: the conversion happens in place, so ``g`` itself is modified.

    :param g: Graph to export.
    :type g: igraph.Graph
    :param path: Target file. A ``str`` is accepted as well as a ``Path``.
    :type path: pathlib.Path | str
    :returns: None
    """
    for seq in (g.vs, g.es):
        _serialize_seq_attrs(seq)
    _serialize_graph_attrs(g)
    # Coerce so that plain string paths do not fail on ``.as_posix()``.
    g.write_graphml(Path(path).as_posix())
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class CreateNetwork:
    """Create networks for co-author and co-country analysis.

    Builds time-sliced networks from a dataframe of OpenAlex works (e.g., as
    produced by TransformOA). For every year in ``year_range`` one network is
    built from all works published in ``[year - interval, year]`` (both ends
    inclusive), e.g. year 1960 with interval 5 gathers 1955 through 1960.

    Expected dataframe columns:
      - publication_year: int
      - authorships: list of (author_id, [institution_ids]) for coauthor mode
      - countries: list[str] of country codes for cocountry mode
      - id, title, topics: copied onto every edge created for a work

    :param dataframe: Source records to derive networks from.
    :type dataframe: pandas.DataFrame
    :param year_range: Inclusive (start_year, end_year) for which networks are built.
    :type year_range: tuple[int, int]
    :param interval: Number of years to look back from each year. Defaults to 1.
    :type interval: int, optional
    :param out_path: Directory where the GraphML files are written.
    :type out_path: pathlib.Path, optional
    :param net_type: Network type to build: "coauthor" or "cocountry".
      Defaults to "coauthor".
    :type net_type: str, optional
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        year_range: tuple,
        interval: int = 1,
        out_path: Path = Path("."),
        net_type: str = "coauthor",
    ):
        """Init class."""
        self.dataframe = dataframe
        self.year_range = year_range
        self.interval = interval
        self.out_path = out_path
        self.net_type = net_type

    def run(self) -> str:
        """Build and export all networks, then return the string "Done"."""
        self._write_graphml()
        return "Done"

    def _create_edge(self, row: dict) -> list:
        """Create all co-occurrence edges for a single publication record.

        Generates every unordered pair (nC2) from ``row["authorships"]``
        (coauthor mode) or ``row["countries"]`` (cocountry mode). A pair is
        skipped when either side has no usable identifier (``None``, an empty
        value, or a missing entry). Records with fewer than two actors yield no
        edges.

        Returned edge format, one tuple per pair:
          (source, target, weight, paper, title, year, topics)

          where:
            - source, target: author IDs or country codes,
            - weight: always 1 for a publication-level edge,
            - paper: ``row["id"]``,
            - title: ``row["title"]``,
            - year: ``int(row["publication_year"])``,
            - topics: ``row["topics"]``.

        :param row: Publication record providing the actors and metadata.
        :type row: dict | pandas.Series
        :returns: List of edges with metadata for the given publication.
        :rtype: list[tuple]
        :raises KeyError: If ``net_type`` is neither "coauthor" nor "cocountry",
          or if a required key is missing from ``row``.
        :raises TypeError: If the actor field is not a sized iterable.
        """
        if self.net_type == "coauthor":
            key = "authorships"
        elif self.net_type == "cocountry":
            key = "countries"
        else:
            raise KeyError("Please select net_type equals coauthor or cocountry.")

        members = row[key]
        # Two actors already form one pair; only 0 or 1 actors give no edge.
        if members is None or len(members) < 2:
            return []

        is_coauthor = self.net_type == "coauthor"
        edges = []
        for first, second in combinations(members, 2):
            if first is None or second is None:
                continue
            if is_coauthor:
                # Author entries are (author_id, [institution_ids]).
                src, trg = first[0], second[0]
            else:
                src, trg = first, second
            # Check the identifiers themselves, not the enclosing entries.
            if not src or not trg:
                continue
            edges.append(
                (
                    src,
                    trg,
                    1,
                    row["id"],
                    row["title"],
                    int(row["publication_year"]),
                    row["topics"],
                )
            )
        return edges

    def _write_graphml(self):
        """Build rolling-window networks and export them to GraphML files.

        For each year in the inclusive range [year_range[0], year_range[1]], this
        method:
          1) Selects works with publication_year in [year - self.interval, year]
             (inclusive).
          2) Creates per-publication edges via `_create_edge(row)`.
          3) Aggregates the edges by (source, target) to compute
               - weight: number of edge rows for that pair in the window,
               - paper: unique list of publication identifiers,
               - years: unique list of publication years,
               - topics: unique list of topics across those publications.
          4) Builds an igraph Graph and writes it to
             `self.out_path / f"{self.net_type}_{year}.graphml"`.

        Pairs are grouped exactly as emitted, so (A, B) and (B, A) coming from
        different publications are aggregated as separate edges.

        :returns: None
        :rtype: None
        :raises KeyError: If the dataframe has no `publication_year` column or a
          row lacks a field required by `_create_edge`.
        """
        columns = ["source", "target", "weight", "paper", "title", "years", "topics"]
        first_year, last_year = self.year_range[0], self.year_range[1]
        publication_years = self.dataframe["publication_year"]
        for year in tqdm(range(first_year, last_year + 1)):
            # Series.between is inclusive on both ends by default.
            in_window = publication_years.between(year - self.interval, year)
            edges = [
                edge
                for _, row in self.dataframe[in_window].iterrows()
                for edge in self._create_edge(row)
            ]
            edge_frame = pd.DataFrame(edges, columns=columns)
            weighted_edges = [
                (
                    source,
                    target,
                    group.shape[0],
                    list(group.paper.unique()),
                    list(group.years.unique()),
                    list({topic for topics in group.topics.values for topic in topics}),
                )
                for (source, target), group in edge_frame.groupby(["source", "target"])
            ]
            graph = ig.Graph.TupleList(
                weighted_edges, edge_attrs=["weight", "paper", "years", "topics"]
            )
            write_graphml_with_json(
                graph, self.out_path / f"{self.net_type}_{year}.graphml"
            )
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class CalculateAICI:
    """Calculate the adjusted internationalization collaboration index (AICI).

    The dataframe must provide a `publication_year` column and a `countries`
    column holding, per work, the collection of country codes of its
    affiliations.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Init class."""
        self.dataframe = dataframe

    def _check_country_exist(self, row: dict) -> bool:
        """Return True when the row lists at least one country entry."""
        return bool(row["countries"])

    def _check_no_country_exist(self, row: dict) -> bool:
        """Return True when the row lists no country entry at all."""
        return not row["countries"]

    def _check_is_international(self, row: dict) -> bool:
        """Return True when the row lists more than one distinct country entry."""
        countries = row["countries"]
        if not countries:
            return False
        return len(set(countries)) > 1

    def _generate_df(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Count, per publication year, the works relevant for the AICI.

        The result has one row per year with the columns `year`, `papers`
        (all works), `with_affil` (works with country entries), `no_affil`
        (works without) and `is_international` (works with more than one
        distinct country entry).
        """
        by_year = dataframe.groupby("publication_year")
        papers = by_year.size().to_dict()
        with_affil = {}
        no_affil = {}
        international = {}
        for year, group in by_year:
            with_affil[year] = int(
                group.apply(self._check_country_exist, axis=1).sum()
            )
            no_affil[year] = int(
                group.apply(self._check_no_country_exist, axis=1).sum()
            )
            international[year] = int(
                group.apply(self._check_is_international, axis=1).sum()
            )
        # One dict per row, then transpose so that years become the index.
        table = pd.DataFrame([papers, with_affil, no_affil, international]).T
        table = table.rename(
            columns={0: "papers", 1: "with_affil", 2: "no_affil", 3: "is_international"}
        )
        return table.reset_index().rename(columns={"index": "year"})

    def complete_df(self) -> pd.DataFrame:
        """Return the yearly counts for the full dataframe, labelled "global"."""
        result = self._generate_df(self.dataframe)
        result.insert(0, "level", "global")
        return result

    def country_df(self, country: str) -> pd.DataFrame:
        """Return the yearly counts for works that list ``country``."""
        has_country = self.dataframe.countries.apply(lambda entry: country in entry)
        subset = self.dataframe[has_country].reset_index(drop=True)
        result = self._generate_df(subset)
        result.insert(0, "level", country)
        return result

    def country_compare(self, country_list: list) -> pd.DataFrame:
        """Create data to compare the AICI of several countries.

        The global rows carry `is_international / with_affil`. Each country's
        rows carry that country's `is_international` divided by the *global*
        `with_affil` count of the same year. All rows are stacked into one
        dataframe, global rows first.
        """
        global_df = self.complete_df()
        global_df.insert(
            0, "aici", global_df["is_international"] / global_df["with_affil"]
        )
        affil_by_year = global_df.set_index("year")["with_affil"]
        frames = [global_df]
        for country in country_list:
            per_country = self.country_df(country)
            ratio = per_country.set_index("year")["is_international"] / affil_by_year
            frames.append(
                per_country.merge(
                    ratio.to_frame("aici"), left_on="year", right_index=True
                )
            )
        return pd.concat(frames, ignore_index=True)
|
collabnet/data.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# Transform source data to required format.
|
|
2
|
+
# Source data can come from OpenAlex by using the utils module.
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import pyalex
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TransformOA:
|
|
13
|
+
"""Read and transform Open Alex data.
|
|
14
|
+
|
|
15
|
+
File list should be a list of file paths.
|
|
16
|
+
Source files have to be in JSON format.
|
|
17
|
+
|
|
18
|
+
Using TransformOA.run() has the following workflow:
|
|
19
|
+
1) Open the UTF-8 encoded JSON file.
|
|
20
|
+
2) Load a list of OpenAlex Work records.
|
|
21
|
+
3) Normalize each record via `_process_entry`.
|
|
22
|
+
4) Create a pandas DataFrame and write it as JSON to `self.out_path`
|
|
23
|
+
using the same base filename.
|
|
24
|
+
|
|
25
|
+
Side effects:
|
|
26
|
+
- Writes a JSON file to `<self.out_path>/<file_path.name>`.
|
|
27
|
+
|
|
28
|
+
Note:
|
|
29
|
+
- UnicodeDecodeError from the JSON files are caught and logged.
|
|
30
|
+
- Other I/O and JSON parsing errors will propagate to the caller.
|
|
31
|
+
|
|
32
|
+
:param src_files: List of source file paths to JSON files containing OpenAlex records.
|
|
33
|
+
:type src_files: list[str] | list[pathlib.Path]
|
|
34
|
+
:param out_path: Directory where transformed outputs will be written. Defaults to the current directory.
|
|
35
|
+
:type out_path: str | pathlib.Path, optional
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(self, src_files: list, out_path: str = "."):
|
|
39
|
+
"""Init class."""
|
|
40
|
+
self.src_files = src_files
|
|
41
|
+
self.out_path = out_path
|
|
42
|
+
|
|
43
|
+
def _getAuthorAffilID(self, authors) -> tuple:
|
|
44
|
+
"""Extract author and affiliation identifiers and associated country codes.
|
|
45
|
+
|
|
46
|
+
Expects the OpenAlex "authorships" structure:
|
|
47
|
+
- Each item should contain an "author" dict with an "id".
|
|
48
|
+
- Each item should contain an "institutions" list; each institution may have "id" and "country_code".
|
|
49
|
+
|
|
50
|
+
Returns a tuple:
|
|
51
|
+
- List of (author_id, [institution_ids]) pairs.
|
|
52
|
+
- List of country codes (one per institution encountered, duplicates possible).
|
|
53
|
+
|
|
54
|
+
Note: Missing keys are returned as empty dicts ({}).
|
|
55
|
+
|
|
56
|
+
:param authors: List of authorship entries from an OpenAlex Work.
|
|
57
|
+
:type authors: list[dict]
|
|
58
|
+
:returns: (author_affils, countries)
|
|
59
|
+
where author_affils is list[tuple[str | dict, list[str | dict]]]
|
|
60
|
+
and countries is list[str | dict]
|
|
61
|
+
:rtype: tuple[list[tuple[str | dict, list[str | dict]]], list[str | dict]]
|
|
62
|
+
:raises TypeError: If `authors` is not iterable.
|
|
63
|
+
"""
|
|
64
|
+
result = []
|
|
65
|
+
countries = []
|
|
66
|
+
for author in authors:
|
|
67
|
+
idx = author.get("author", {}).get("id", {})
|
|
68
|
+
instList = []
|
|
69
|
+
institutions = author.get("institutions")
|
|
70
|
+
for ins in institutions:
|
|
71
|
+
instList.append(ins.get("id", {}))
|
|
72
|
+
countries.append(ins.get("country_code", {}))
|
|
73
|
+
result.append((idx, instList))
|
|
74
|
+
return result, countries
|
|
75
|
+
|
|
76
|
+
def _getTopicID(self, topics) -> list:
|
|
77
|
+
"""Collect topic identifiers from an OpenAlex Work.
|
|
78
|
+
|
|
79
|
+
Expects a list of topic dicts, each with an "id" key.
|
|
80
|
+
|
|
81
|
+
Note: Missing keys are returned as empty dicts ({}).
|
|
82
|
+
|
|
83
|
+
:param topics: Topics section from an OpenAlex Work.
|
|
84
|
+
:type topics: list[dict]
|
|
85
|
+
:returns: List of topic IDs.
|
|
86
|
+
:rtype: list[str | dict]
|
|
87
|
+
:raises TypeError: If `topics` is not iterable.
|
|
88
|
+
"""
|
|
89
|
+
result = []
|
|
90
|
+
for topic in topics:
|
|
91
|
+
result.append(topic.get("id", {}))
|
|
92
|
+
return result
|
|
93
|
+
|
|
94
|
+
def _getJournalID(self, publication_location) -> str | None:
|
|
95
|
+
"""Return the source (journal) identifier from a Work's primary location.
|
|
96
|
+
|
|
97
|
+
Expects the "primary_location" structure:
|
|
98
|
+
- A dict containing a "source" dict with an "id" key.
|
|
99
|
+
|
|
100
|
+
:param publication_location: The Work's primary_location object.
|
|
101
|
+
:type publication_location: dict
|
|
102
|
+
:returns: Source ID if available; otherwise None.
|
|
103
|
+
:rtype: str | None
|
|
104
|
+
"""
|
|
105
|
+
source = publication_location.get("source", {})
|
|
106
|
+
if source:
|
|
107
|
+
return source.get("id", {})
|
|
108
|
+
return None
|
|
109
|
+
|
|
110
|
+
def _process_entry(self, work: dict) -> dict:
|
|
111
|
+
"""Normalize a single OpenAlex Work record.
|
|
112
|
+
|
|
113
|
+
Extracts a subset of fields from the raw work, reconstructs the abstract
|
|
114
|
+
from the abstract_inverted_index, replaces it with a plain-text abstract,
|
|
115
|
+
and enriches the record with processed authorships, countries, topics, and
|
|
116
|
+
primary_location identifiers.
|
|
117
|
+
|
|
118
|
+
Required input keys in `work` for transformations:
|
|
119
|
+
- abstract_inverted_index
|
|
120
|
+
- authorships
|
|
121
|
+
- topics
|
|
122
|
+
- primary_location
|
|
123
|
+
|
|
124
|
+
Output keys in the returned dict:
|
|
125
|
+
- abstract (reconstructed from abstract_inverted_index)
|
|
126
|
+
- authorships (processed via _getAuthorAffilID)
|
|
127
|
+
- countries (derived from authors' affiliations)
|
|
128
|
+
- topics (processed via _getTopicID)
|
|
129
|
+
- primary_location (processed via _getJournalID)
|
|
130
|
+
|
|
131
|
+
The keys id, doi, title, type, publication_year and referenced_works are copied.
|
|
132
|
+
|
|
133
|
+
:param work: Raw OpenAlex Work object as returned by the API.
|
|
134
|
+
:type work: dict
|
|
135
|
+
:returns: Normalized work record with selected fields and derived values.
|
|
136
|
+
:rtype: dict
|
|
137
|
+
:raises TypeError: If `work` is not a mapping-like object.
|
|
138
|
+
:raises KeyError: If any required field is missing from `work`.
|
|
139
|
+
"""
|
|
140
|
+
data_keys = [
|
|
141
|
+
"doi",
|
|
142
|
+
"title",
|
|
143
|
+
"type",
|
|
144
|
+
"publication_year",
|
|
145
|
+
"abstract_inverted_index",
|
|
146
|
+
"referenced_works",
|
|
147
|
+
]
|
|
148
|
+
result = dict()
|
|
149
|
+
result = {"id": work["id"]}
|
|
150
|
+
for key in data_keys:
|
|
151
|
+
result.update({key: work[key]})
|
|
152
|
+
tempWork = pyalex.Work(result)
|
|
153
|
+
abstract = tempWork["abstract"]
|
|
154
|
+
result.update({"abstract": abstract})
|
|
155
|
+
result.pop("abstract_inverted_index", None)
|
|
156
|
+
authors, countries = self._getAuthorAffilID(work["authorships"])
|
|
157
|
+
result.update({"authorships": authors})
|
|
158
|
+
result.update({"countries": countries})
|
|
159
|
+
result.update({"topics": self._getTopicID(work["topics"])})
|
|
160
|
+
result.update(
|
|
161
|
+
{"primary_location": self._getJournalID(work["primary_location"])}
|
|
162
|
+
)
|
|
163
|
+
return result
|
|
164
|
+
|
|
165
|
+
def _process_file(self, file_path: Path) -> None:
|
|
166
|
+
"""Transform one source JSON file into the normalized output format.
|
|
167
|
+
|
|
168
|
+
:param file_path: Path to a UTF-8 encoded JSON file containing a list of
|
|
169
|
+
OpenAlex Work objects.
|
|
170
|
+
:type file_path: pathlib.Path
|
|
171
|
+
:returns: None
|
|
172
|
+
:rtype: None
|
|
173
|
+
:raises FileNotFoundError: If the input file does not exist.
|
|
174
|
+
:raises json.JSONDecodeError: If the file content is not valid JSON.
|
|
175
|
+
:raises OSError: For general I/O errors while reading or writing.
|
|
176
|
+
"""
|
|
177
|
+
with open(Path(file_path), encoding="utf-8") as infile:
|
|
178
|
+
try:
|
|
179
|
+
data = json.load(infile)
|
|
180
|
+
result = [self._process_entry(work) for work in data]
|
|
181
|
+
dftemp = pd.DataFrame(result)
|
|
182
|
+
dftemp.to_json(f"{self.out_path}{file_path.name}")
|
|
183
|
+
except UnicodeDecodeError as r:
|
|
184
|
+
print("error in file", r)
|
|
185
|
+
|
|
186
|
+
def run(self) -> None:
|
|
187
|
+
"""Process all source files defined in `self.src_files`.
|
|
188
|
+
|
|
189
|
+
Iterates over `self.src_files` with a progress bar and invokes
|
|
190
|
+
`_process_file` for each entry.
|
|
191
|
+
|
|
192
|
+
:returns: None
|
|
193
|
+
:rtype: None
|
|
194
|
+
"""
|
|
195
|
+
for file_path in tqdm(self.src_files):
|
|
196
|
+
self._process_file(file_path)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def join_to_df(
    src_folder: Path, out_folder: Path = Path("."), selection: str = "*.json"
) -> pd.DataFrame:
    """Create combined dataframe from JSON files.

    Entries are de-duplicated based on their work ID; the first occurrence is
    kept. Files are read in sorted path order so the result is reproducible.

    If only specific JSON files should be joined, specify a glob pattern like
    ``works_s*.json``. Recursive globs are supported as well, e.g.
    ``**/*.json`` for all JSON files in all subfolders.

    Side effects: writes the combined data as JSON lines to
    ``<out_folder>/joined_data.json`` and prints the number of removed
    duplicates. A ``joined_data.json`` from an earlier run is never read back
    as input.

    :param src_folder: Root directory to search for input JSON files.
    :type src_folder: pathlib.Path
    :param out_folder: Directory where ``joined_data.json`` is written.
      Defaults to the current directory.
    :type out_folder: pathlib.Path, optional
    :param selection: Glob pattern used to select input JSON files relative to
      src_folder. Defaults to ``*.json``.
    :type selection: str, optional
    :returns: Combined dataframe of unique works from the selected JSON files.
    :rtype: pandas.DataFrame
    :raises ValueError: If no input file matches ``selection``.
    """
    src_folder = Path(src_folder)
    target = Path(out_folder) / "joined_data.json"
    resolved_target = target.resolve()
    # Sort for a deterministic "first occurrence wins" de-duplication and
    # skip our own output file, which uses a different (JSON lines) layout.
    file_paths = sorted(
        path
        for path in src_folder.glob(selection)
        if path.resolve() != resolved_target
    )
    if not file_paths:
        raise ValueError(f"No files matching {selection!r} found in {src_folder}.")
    data = pd.concat(
        [pd.read_json(filepath) for filepath in file_paths], ignore_index=True
    )
    dedup_data = data.drop_duplicates(subset="id")
    dedup_data.to_json(target, lines=True, orient="records")
    print(f"Deduplicated {data.shape[0] - dedup_data.shape[0]} entries.")
    return dedup_data
|
collabnet/utils.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Query OA for works in relation to a journal or topic
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pyalex
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
from validator_collection import validators
|
|
9
|
+
|
|
10
|
+
pyalex.config.max_retries = 1
|
|
11
|
+
pyalex.config.retry_backoff_factor = 0.1
|
|
12
|
+
pyalex.config.retry_http_codes = [429, 500, 503]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class QueryOA:
|
|
16
|
+
"""Query Open Alex to receive publication records.
|
|
17
|
+
|
|
18
|
+
:param config_email: Contact email used for OpenAlex polite API requests.
|
|
19
|
+
:type config_email: str
|
|
20
|
+
:param query_list: List of query terms or identifiers to query against OpenAlex.
|
|
21
|
+
:type query_list: list
|
|
22
|
+
:param year_range: Inclusive (start_year, end_year) for filtering publication years.
|
|
23
|
+
:type year_range: tuple[int, int]
|
|
24
|
+
:param out_path: Directory where results/artifacts should be written. Defaults to the current directory.
|
|
25
|
+
:type out_path: pathlib.Path, optional
|
|
26
|
+
:param query_type: Type of query to perform ("journal", "topic", or "institution"). Defaults to "topic".
|
|
27
|
+
:type query_type: str, optional
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(
|
|
31
|
+
self,
|
|
32
|
+
config_email: str,
|
|
33
|
+
query_list: list,
|
|
34
|
+
year_range: tuple,
|
|
35
|
+
out_path: Path = Path("."),
|
|
36
|
+
query_type: str = "topic",
|
|
37
|
+
n_max: int | None = None,
|
|
38
|
+
):
|
|
39
|
+
assert query_type in ["topic", "journal", "institution"], (
|
|
40
|
+
"Use supported query type: topic, journal or institution"
|
|
41
|
+
)
|
|
42
|
+
if query_type == "topic":
|
|
43
|
+
assert all([x.startswith("t") for x in query_list]), (
|
|
44
|
+
"Provide list of correct topic IDs starting with t."
|
|
45
|
+
)
|
|
46
|
+
elif query_type == "journal":
|
|
47
|
+
assert all([x.startswith("s") for x in query_list]), (
|
|
48
|
+
"Provide list of correct journal IDs starting with s."
|
|
49
|
+
)
|
|
50
|
+
else:
|
|
51
|
+
pass
|
|
52
|
+
assert validators.email(config_email), "Provide a valid email address."
|
|
53
|
+
pyalex.config.email = config_email
|
|
54
|
+
self.query_list = query_list
|
|
55
|
+
self.query_type = query_type
|
|
56
|
+
self.year_range = year_range
|
|
57
|
+
self.out_path = out_path
|
|
58
|
+
self.n_max = n_max
|
|
59
|
+
|
|
60
|
+
def run(self) -> list:
|
|
61
|
+
file_list = self._run_query(
|
|
62
|
+
self.query_list, self.query_type, self.year_range, self.out_path
|
|
63
|
+
)
|
|
64
|
+
return file_list
|
|
65
|
+
|
|
66
|
+
def _run_query(self, query_list, query_type, year_range, out_path) -> list:
|
|
67
|
+
"""Execute an OpenAlex query and collect publication records.
|
|
68
|
+
|
|
69
|
+
:param query_list: List of query terms or identifiers to query against OpenAlex.
|
|
70
|
+
:type query_list: list
|
|
71
|
+
:param query_type: Type of query to perform (e.g., "topic", "author", "source").
|
|
72
|
+
:type query_type: str
|
|
73
|
+
:param year_range: Inclusive (start_year, end_year) for filtering publication years.
|
|
74
|
+
:type year_range: tuple[int, int]
|
|
75
|
+
:param out_path: Directory where results or intermediate artifacts may be written.
|
|
76
|
+
:type out_path: pathlib.Path
|
|
77
|
+
:returns: Retrieved publication records.
|
|
78
|
+
:rtype: list[dict]
|
|
79
|
+
"""
|
|
80
|
+
generated_files = []
|
|
81
|
+
for entry in tqdm(query_list):
|
|
82
|
+
all_current_works = []
|
|
83
|
+
if query_type == "topic":
|
|
84
|
+
temp_works = self._topic_query(entry, year_range)
|
|
85
|
+
elif query_type == "journal":
|
|
86
|
+
temp_works = self._journal_query(entry, year_range)
|
|
87
|
+
elif query_type == "institution":
|
|
88
|
+
temp_works = self._affiliation_query(entry, year_range)
|
|
89
|
+
else:
|
|
90
|
+
raise TypeError("Provide query_type.")
|
|
91
|
+
for res_temp_works in temp_works:
|
|
92
|
+
all_current_works.append(res_temp_works)
|
|
93
|
+
with open(
|
|
94
|
+
Path(out_path / f"works_{entry}.json"), "w", encoding="utf8"
|
|
95
|
+
) as json_file:
|
|
96
|
+
json.dump(
|
|
97
|
+
[x for y in all_current_works for x in y],
|
|
98
|
+
json_file,
|
|
99
|
+
ensure_ascii=False,
|
|
100
|
+
)
|
|
101
|
+
generated_files.append(Path(out_path / f"works_{entry}.json"))
|
|
102
|
+
return generated_files
|
|
103
|
+
|
|
104
|
+
def _topic_query(self, entry: str, year_range: tuple):
|
|
105
|
+
"""Run a topic-based query against OpenAlex for a single entry.
|
|
106
|
+
|
|
107
|
+
:param entry: Topic identifier to query (topic ids start with the letter t).
|
|
108
|
+
:type entry: str
|
|
109
|
+
:param year_range: Inclusive (start_year, end_year) range for filtering publication years.
|
|
110
|
+
:type year_range: tuple[int, int]
|
|
111
|
+
:returns: Publication records matching the topic and year filter.
|
|
112
|
+
:rtype: list[dict]
|
|
113
|
+
"""
|
|
114
|
+
temp_works = (
|
|
115
|
+
pyalex.Works()
|
|
116
|
+
.filter(
|
|
117
|
+
topics={"id": entry},
|
|
118
|
+
from_publication_date=f"{year_range[0]}-01-01",
|
|
119
|
+
to_publication_date=f"{year_range[1]}-12-31",
|
|
120
|
+
)
|
|
121
|
+
.paginate(per_page=200, n_max=self.n_max)
|
|
122
|
+
)
|
|
123
|
+
return temp_works
|
|
124
|
+
|
|
125
|
+
def _journal_query(self, entry: str, year_range: tuple):
|
|
126
|
+
"""Run a journal-based query against OpenAlex for a single entry.
|
|
127
|
+
|
|
128
|
+
:param entry: Journal identifier to query (journal ids start with the letter s).
|
|
129
|
+
:type entry: str
|
|
130
|
+
:param year_range: Inclusive (start_year, end_year) range for filtering publication years.
|
|
131
|
+
:type year_range: tuple[int, int]
|
|
132
|
+
:returns: Publication records matching the topic and year filter.
|
|
133
|
+
:rtype: list[dict]
|
|
134
|
+
"""
|
|
135
|
+
temp_works = (
|
|
136
|
+
pyalex.Works()
|
|
137
|
+
.filter(
|
|
138
|
+
primary_location={"source": {"id": entry}},
|
|
139
|
+
from_publication_date=f"{year_range[0]}-01-01",
|
|
140
|
+
to_publication_date=f"{year_range[1]}-12-31",
|
|
141
|
+
)
|
|
142
|
+
.paginate(per_page=200, n_max=self.n_max)
|
|
143
|
+
)
|
|
144
|
+
return temp_works
|
|
145
|
+
|
|
146
|
+
def _affiliation_query(self, entry: str, year_range: tuple):
|
|
147
|
+
"""Run a institution-based query against OpenAlex for a single entry.
|
|
148
|
+
|
|
149
|
+
:param entry: ROR ID to query (See ror.org for search options).
|
|
150
|
+
:type entry: str
|
|
151
|
+
:param year_range: Inclusive (start_year, end_year) range for filtering publication years.
|
|
152
|
+
:type year_range: tuple[int, int]
|
|
153
|
+
:returns: Publication records matching the topic and year filter.
|
|
154
|
+
:rtype: list[dict]
|
|
155
|
+
"""
|
|
156
|
+
temp_works = (
|
|
157
|
+
pyalex.Works()
|
|
158
|
+
.filter(
|
|
159
|
+
authorships={"institutions": {"ror": entry}},
|
|
160
|
+
from_publication_date=f"{year_range[0]}-01-01",
|
|
161
|
+
to_publication_date=f"{year_range[1]}-12-31",
|
|
162
|
+
)
|
|
163
|
+
.paginate(per_page=200, n_max=self.n_max)
|
|
164
|
+
)
|
|
165
|
+
return temp_works
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: collabnet
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Build collaboration networks from publication records.
|
|
5
|
+
Author: Malte Vogl
|
|
6
|
+
Author-email: Malte Vogl <vogl@gea.mpg.de>
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Dist: pyalex
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: igraph
|
|
13
|
+
Requires-Dist: tqdm
|
|
14
|
+
Requires-Dist: matplotlib
|
|
15
|
+
Requires-Dist: validator-collection>=1.5.0
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Project-URL: Bug Tracker, https://gitlab.gwdg.de/mpigea/dt/collabnet/-/issues
|
|
18
|
+
Project-URL: Homepage, https://gitlab.gwdg.de/mpigea/dt/collabnet
|
|
19
|
+
Project-URL: Project Homepage, https://modelsen.gea.mpg.de
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# CollabNet
|
|
23
|
+
|
|
24
|
+
[![Documentation Status](https://readthedocs.org/projects/collabnet/badge/?version=latest)](https://collabnet.readthedocs.io/en/latest/?badge=latest)
|
|
25
|
+
|
|
26
|
+
This Python package allows you to generate co-authorship and co-country networks.
|
|
27
|
+
The source data (currently from OpenAlex) can be works for the same journal,
|
|
28
|
+
institution or topic. The data is transformed into a suitable format to facilitate
|
|
29
|
+
the network creation. Networks are generated and saved as GRAPHML files.
|
|
30
|
+
|
|
31
|
+
In addition, the app allows you to calculate the Adjusted International Collaboration Index,
|
|
32
|
+
as defined in Lalli 2025.
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
|
|
36
|
+
[Gephi Lite](https://lite.gephi.org) plot of Co-authorship network for 1980 with interval 3 with 738 nodes and 2402 edges.
|
|
37
|
+
Layout in ForceAtlas2 design, colors related to Louvain modularity class. Node size by degree. The selected node with label has
|
|
38
|
+
degree 16.
|
|
39
|
+
|
|
40
|
+
Documentation is available on [ReadTheDocs](https://collabnet.readthedocs.io).
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
tl;dr Use pip
|
|
45
|
+
|
|
46
|
+
~~~bash
|
|
47
|
+
pip install collabnet
|
|
48
|
+
~~~
|
|
49
|
+
|
|
50
|
+
Consider using a clean virtual environment to keep your main packages separated.
|
|
51
|
+
Create a new virtual environment and install the package
|
|
52
|
+
|
|
53
|
+
~~~bash
|
|
54
|
+
python3 -m venv env
|
|
55
|
+
source env/bin/activate
|
|
56
|
+
pip install collabnet
|
|
57
|
+
~~~
|
|
58
|
+
|
|
59
|
+
## Example
|
|
60
|
+
|
|
61
|
+
You can find an example Jupyter Notebook showing the use of the package by demonstrating how to query for data, transform the data, and generate networks in the [examples folder](../example/collab_pipeline.ipynb).
|
|
62
|
+
|
|
63
|
+
## Testing
|
|
64
|
+
|
|
65
|
+
Tests can be run by running
|
|
66
|
+
|
|
67
|
+
~~~bash
|
|
68
|
+
uv run pytest --cov-report=term-missing --cov=src/collabnet
|
|
69
|
+
~~~
|
|
70
|
+
which installs the `test` dependency group and runs the test files. The test coverage is shown in the output.
|
|
71
|
+
|
|
72
|
+
## Building documentation
|
|
73
|
+
|
|
74
|
+
The documentation is built using _sphinx_. Installation of the _doc_ dependency group is automated with `uv`:
|
|
75
|
+
|
|
76
|
+
~~~bash
|
|
77
|
+
uv run sphinx-build -c docs -b html docs docs/_build/html
|
|
78
|
+
~~~
|
|
79
|
+
|
|
80
|
+
## Funding information
|
|
81
|
+
|
|
82
|
+
This work is part of a collaboration between the department for
|
|
83
|
+
Structural Changes of the Technosphere, Max Planck Institute of Geoanthropology, Jena, Germany and
|
|
84
|
+
DIMEAS - Department of Mechanical and Aerospace Engineering, Politecnico di Torino, Torino, Italy.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
collabnet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
collabnet/analysis.py,sha256=4IzlcoSOA6WuO_IxukbAHrpPjSxkKaYFiaj9RV577Hw,13322
|
|
3
|
+
collabnet/data.py,sha256=rWq2CGF5lHRheDtrDsjflfMpy7u-PRMEJiflTYHnPk4,9201
|
|
4
|
+
collabnet/utils.py,sha256=qPL2g3YkhtASsX-fc6fgkkLBbzvDZSnzAizLKVhM-30,6598
|
|
5
|
+
collabnet-0.1.0.dist-info/WHEEL,sha256=xDCZ-UyfvkGuEHPeI7BcJzYKIZzdqN8A8o1M5Om8IyA,79
|
|
6
|
+
collabnet-0.1.0.dist-info/METADATA,sha256=rpmQ5vgrPfO9Z_nSSg2gbVT54Mm-_-KIj7S5KYe-WyU,3010
|
|
7
|
+
collabnet-0.1.0.dist-info/RECORD,,
|