gismap 0.2.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- gismap/__init__.py +2 -0
- gismap/build.py +4 -0
- gismap/gisgraphs/__init__.py +0 -0
- gismap/gisgraphs/builder.py +105 -0
- gismap/{lab → gisgraphs}/graph.py +70 -66
- gismap/gisgraphs/groups.py +70 -0
- gismap/gisgraphs/js.py +190 -0
- gismap/gisgraphs/options.py +37 -0
- gismap/gisgraphs/style.py +119 -0
- gismap/gisgraphs/widget.py +145 -0
- gismap/lab/__init__.py +0 -4
- gismap/lab/egomap.py +6 -7
- gismap/lab/expansion.py +7 -6
- gismap/lab/filters.py +1 -1
- gismap/lab/lab_author.py +50 -6
- gismap/lab/labmap.py +7 -6
- gismap/lab_examples/__init__.py +0 -0
- gismap/lab_examples/cedric.py +46 -0
- gismap/lab_examples/lamsade.py +43 -0
- gismap/{lab → lab_examples}/lincs.py +2 -2
- gismap/{lab → lab_examples}/toulouse.py +20 -3
- gismap/sources/dblp.py +16 -18
- gismap/sources/dblp_ttl.py +168 -0
- gismap/sources/hal.py +19 -10
- gismap/sources/ldb.py +501 -0
- gismap/sources/models.py +7 -0
- gismap/sources/multi.py +25 -17
- gismap/utils/common.py +15 -10
- gismap/utils/logger.py +2 -0
- gismap/utils/requests.py +6 -2
- gismap/utils/zlist.py +68 -0
- {gismap-0.2.2.dist-info → gismap-0.4.0.dist-info}/METADATA +37 -8
- gismap-0.4.0.dist-info/RECORD +43 -0
- {gismap-0.2.2.dist-info → gismap-0.4.0.dist-info}/WHEEL +1 -1
- gismap/lab/vis.py +0 -329
- gismap-0.2.2.dist-info/RECORD +0 -30
- /gismap/{lab → lab_examples}/lip6.py +0 -0
- {gismap-0.2.2.dist-info → gismap-0.4.0.dist-info}/licenses/AUTHORS.md +0 -0
gismap/sources/ldb.py
ADDED
@@ -0,0 +1,501 @@
+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import ClassVar
+from platformdirs import user_data_dir
+from pathlib import Path
+from datetime import datetime, timezone
+import errno
+import json
+import os
+
+import zstandard as zstd
+import dill as pickle
+import numpy as np
+import numba as nb
+from bof.fuzz import Process
+from gismo.common import safe_write
+from tqdm.auto import tqdm
+import requests
+
+from gismap.sources.dblp_ttl import publis_streamer
+from gismap.sources.models import DB, Author, Publication
+from gismap.utils.logger import logger
+from gismap.utils.text import asciify
+from gismap.utils.zlist import ZList
+
+
+DATA_DIR = Path(user_data_dir(
+    appname="gismap",
+    appauthor=False,
+))
+
+LDB_STEM = "ldb"
+
+LDB_PATH = DATA_DIR / f"{LDB_STEM}.pkl.zst"
+
+TTL_URL = "https://dblp.org/rdf/dblp.ttl.gz"
+
+# GitHub release asset constants
+GITHUB_REPO = "balouf/gismap"
+GITHUB_API_URL = f"https://api.github.com/repos/{GITHUB_REPO}/releases"
+LDB_ASSET_NAME = "ldb.pkl.zst"
+LDB_META_PATH = DATA_DIR / "ldb_meta.json"
+
+
+@dataclass(repr=False)
+class LDB(DB):
+    """
+    Browse DBLP from a local copy of the database.
+
+    LDB is a class-only database - it should not be instantiated.
+    All methods are classmethods and state is stored in class variables.
+    """
+    db_name: ClassVar[str] = LDB_STEM
+    source: ClassVar[str] = TTL_URL
+
+    # Class-level state (replaces instance attributes)
+    authors: ClassVar[ZList | None] = None
+    publis: ClassVar[ZList | None] = None
+    keys: ClassVar[dict | None] = None
+    search_engine: ClassVar[Process | None] = None
+    _initialized: ClassVar[bool] = False
+
+    __hash__ = object.__hash__
+
+    def __init__(self):
+        raise TypeError(
+            "LDB should not be instantiated. Use class methods directly, e.g., LDB.search_author(name)"
+        )
+
+    @classmethod
+    def _ensure_loaded(cls):
+        """Lazy-load the database if not already loaded."""
+        if cls._initialized:
+            return
+        if LDB_PATH.exists():
+            cls.load_db()
+        else:
+            logger.info("LDB not found locally. Attempting to retrieve from GitHub...")
+            try:
+                cls.retrieve()
+                cls.load_db()
+            except RuntimeError as e:
+                logger.warning(f"Could not auto-retrieve LDB: {e}")
+
+    @classmethod
+    def build_db(cls, source=None, limit=None, n_range=2, length_impact=.1, authors_frame=512, publis_frame=256):
+        if source is None:
+            source = cls.source
+        authors_dict = dict()
+        logger.info("Retrieve publications")
+        with ZList(frame_size=publis_frame) as publis:
+            for i, (key, title, typ, authors, url, streams, pages, venue, year) in enumerate(publis_streamer(source)):
+                auth_indices = []
+                for auth_key, auth_name in authors.items():
+                    if auth_key not in authors_dict:
+                        authors_dict[auth_key] = (len(authors_dict), auth_name, [i])
+                    else:
+                        authors_dict[auth_key][2].append(i)
+                    auth_indices.append(authors_dict[auth_key][0])
+                publis.append((key, title, typ, auth_indices, url, streams, pages, venue, year))
+                if i == limit:
+                    break
+        cls.publis = publis
+        logger.info(f"{len(publis)} publications retrieved.")
+        logger.info("Compact authors")
+        with ZList(frame_size=authors_frame) as authors:
+            for key, (_, name, pubs) in tqdm(authors_dict.items()):
+                authors.append((key, name, pubs))
+        cls.authors = authors
+        cls.keys = {k: v[0] for k, v in authors_dict.items()}
+        del authors_dict
+        cls.search_engine = Process(n_range=n_range, length_impact=length_impact)
+        cls.search_engine.fit([asciify(a[1]) for a in authors])
+        cls.search_engine.choices = np.arange(len(authors))
+        cls.search_engine.vectorizer.features_ = cls.numbify_dict(cls.search_engine.vectorizer.features_)
+        logger.info(f"{len(cls.authors)} compacted.")
+        cls._invalidate_cache()
+        cls._initialized = True
+
+    @classmethod
+    @lru_cache(maxsize=50000)
+    def author_by_index(cls, i):
+        key, name, _ = cls.authors[i]
+        return LDBAuthor(key=key, name=name)
+
+    @classmethod
+    def author_by_key(cls, key):
+        return cls.author_by_index(cls.keys[key])
+
+    @classmethod
+    @lru_cache(maxsize=50000)
+    def publication_by_index(cls, i):
+        key, title, typ, authors, url, streams, pages, venue, year = cls.publis[i]
+        if venue is None:
+            venue = "unpublished"
+        return {"key": key, "title": title, "type": typ,
+                "authors": authors,
+                "url": url, "streams": streams, "pages": pages,
+                "venue": venue, "year": year}
+
+    @classmethod
+    def author_publications(cls, key):
+        cls._ensure_loaded()
+        _, name, pubs = cls.authors[cls.keys[key]]
+        pubs = [cls.publication_by_index(k).copy() for k in pubs]
+        auth_ids = sorted({k for p in pubs for k in p["authors"]})
+        auths = {k: cls.author_by_index(k) for k in auth_ids}
+        for pub in pubs:
+            pub["authors"] = [auths[k] for k in pub["authors"]]
+            metadata = dict()
+            for k in ["url", "streams", "pages"]:
+                v = pub.pop(k)
+                if v is not None:
+                    metadata[k] = v
+            pub["metadata"] = metadata
+        return [LDBPublication(**pub) for pub in pubs]
+
+    @classmethod
+    @lru_cache(maxsize=1000)
+    def search_author(cls, name, limit=2, score_cutoff=40.0, slack=10.0):
+        cls._ensure_loaded()
+        res = cls.search_engine.extract(asciify(name), limit=limit, score_cutoff=score_cutoff)
+        res = [r[0] for r in res if r[1] > res[0][1] - slack]
+        sorted_ids = {i: cls.author_by_index(i) for i in sorted(res)}
+        return [sorted_ids[i] for i in res]
+
+    @classmethod
+    def _invalidate_cache(cls):
+        cls.search_author.cache_clear()
+        cls.publication_by_index.cache_clear()
+        cls.author_by_index.cache_clear()
+
+    @classmethod
+    def from_author(cls, a):
+        return cls.author_publications(a.key)
+
+    @classmethod
+    def _get_release_info(cls, tag: str | None = None) -> dict:
+        """
+        Fetch release metadata from GitHub API.
+
+        Parameters
+        ----------
+        tag: :class:`str`, optional
+            Specific release tag (e.g., "v0.4.0"). If None, fetches latest.
+
+        Returns
+        -------
+        :class:`dict`
+            Release metadata including tag_name and assets.
+
+        Raises
+        ------
+        :class:`RuntimeError`
+            If release not found or API request fails.
+        """
+        if tag is None:
+            url = f"{GITHUB_API_URL}/latest"
+        else:
+            url = f"{GITHUB_API_URL}/tags/{tag}"
+
+        try:
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.HTTPError as e:
+            if response.status_code == 404:
+                raise RuntimeError(f"Release not found: {tag or 'latest'}") from e
+            raise RuntimeError(f"GitHub API error: {e}") from e
+        except requests.exceptions.RequestException as e:
+            raise RuntimeError(f"Network error fetching release info: {e}") from e
+
+    @classmethod
+    def _download_file(cls, url: str, dest: Path, desc: str = "Downloading"):
+        """
+        Download file with progress bar.
+
+        Parameters
+        ----------
+        url : str
+            URL to download from.
+        dest : Path
+            Destination file path.
+        desc : str
+            Description for progress bar.
+        """
+        dest.parent.mkdir(parents=True, exist_ok=True)
+
+        response = requests.get(url, stream=True, timeout=30)
+        response.raise_for_status()
+
+        total_size = int(response.headers.get('content-length', 0))
+
+        with open(dest, 'wb') as f, tqdm(
+            desc=desc,
+            total=total_size,
+            unit='B',
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as pbar:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+
+    @classmethod
+    def _save_meta(cls, tag: str, url: str, size: int):
+        """Save version metadata to JSON file."""
+        meta = {
+            "tag": tag,
+            "url": url,
+            "size": size,
+            "downloaded_at": datetime.now(timezone.utc).isoformat(),
+        }
+        LDB_META_PATH.parent.mkdir(parents=True, exist_ok=True)
+        with open(LDB_META_PATH, 'w') as f:
+            json.dump(meta, f, indent=2)
+
+    @classmethod
+    def _load_meta(cls) -> dict | None:
+        """Load version metadata from JSON file."""
+        if not LDB_META_PATH.exists():
+            return None
+        try:
+            with open(LDB_META_PATH, 'r') as f:
+                return json.load(f)
+        except (json.JSONDecodeError, IOError):
+            return None
+
+    @classmethod
+    def retrieve(cls, version: str | None = None, force: bool = False):
+        """
+        Download LDB database from GitHub releases.
+
+        Parameters
+        ----------
+        version: :class:`str`, optional
+            Specific release version (e.g., "v0.4.0" or "0.4.0").
+            If None, downloads from latest release.
+        force: :class:`bool`, default=False
+            Download even if same version is installed.
+
+        Examples
+        --------
+        >> LDB.retrieve()  # Latest release (freshest data)
+        >> LDB.retrieve("v0.4.0")  # Specific version
+        >> LDB.retrieve("0.4.0")  # Also works without 'v' prefix
+
+        Raises
+        ------
+        RuntimeError
+            If release or asset not found, or download fails.
+        """
+        # Normalize version string (add "v" prefix if missing)
+        tag = None
+        if version is not None:
+            tag = version if version.startswith("v") else f"v{version}"
+
+        # Fetch release info
+        logger.info(f"Fetching release info for: {tag or 'latest'}")
+        release_info = cls._get_release_info(tag)
+        release_tag = release_info["tag_name"]
+
+        # Check if already installed (unless force=True)
+        if not force:
+            meta = cls._load_meta()
+            if meta and meta.get("tag") == release_tag and LDB_PATH.exists():
+                logger.info(f"LDB version {release_tag} already installed. Use force=True to re-download.")
+                return
+
+        # Find ldb.pkl.zst asset in release
+        assets = release_info.get("assets", [])
+        ldb_asset = None
+        for asset in assets:
+            if asset["name"] == LDB_ASSET_NAME:
+                ldb_asset = asset
+                break
+
+        if ldb_asset is None:
+            raise RuntimeError(
+                f"Asset '{LDB_ASSET_NAME}' not found in release {release_tag}. "
+                f"Available assets: {[a['name'] for a in assets]}"
+            )
+
+        download_url = ldb_asset["browser_download_url"]
+        asset_size = ldb_asset["size"]
+
+        logger.info(f"Downloading LDB from release {release_tag} ({asset_size / 1e9:.2f} GB)")
+
+        # Download with progress bar
+        cls._download_file(download_url, LDB_PATH, desc=f"LDB {release_tag}")
+
+        # Save version metadata
+        cls._save_meta(release_tag, download_url, asset_size)
+
+        # Reset initialized flag so next access reloads
+        cls._initialized = False
+        cls._invalidate_cache()
+
+        logger.info(f"LDB {release_tag} successfully installed to {LDB_PATH}")
+
+    @classmethod
+    def db_info(cls) -> dict | None:
+        """
+        Return installed version info.
+
+        Returns
+        -------
+        :class:`dict` or :class:`None`
+            Dictionary with tag, date, size, path; or None if not installed.
+        """
+        meta = cls._load_meta()
+        if meta is None or not LDB_PATH.exists():
+            return None
+
+        return {
+            "tag": meta.get("tag"),
+            "downloaded_at": meta.get("downloaded_at"),
+            "size": meta.get("size"),
+            "path": str(LDB_PATH),
+        }
+
+    @classmethod
+    def check_update(cls) -> dict | None:
+        """
+        Check if a newer version is available on GitHub.
+
+        Returns
+        -------
+        :class:`dict` or None
+            Dictionary with update info if available, None if up to date.
+        """
+        try:
+            release_info = cls._get_release_info()
+            latest_tag = release_info["tag_name"]
+
+            meta = cls._load_meta()
+            current_tag = meta.get("tag") if meta else None
+
+            if current_tag == latest_tag:
+                logger.info(f"LDB is up to date: {current_tag}")
+                return None
+
+            return {
+                "current": current_tag,
+                "latest": latest_tag,
+                "message": f"Update available: {current_tag or 'not installed'} -> {latest_tag}",
+            }
+        except RuntimeError as e:
+            logger.warning(f"Could not check for updates: {e}")
+            return None
+
+    @classmethod
+    def dump(cls, filename: str, path=".", overwrite=False):
+        """Save class state to file."""
+        # Convert numba dict to regular dict for pickling
+        nb_dict = None
+        if cls.search_engine is not None:
+            nb_dict = cls.search_engine.vectorizer.features_
+            cls.search_engine.vectorizer.features_ = dict(nb_dict)
+
+        state = {
+            'authors': cls.authors,
+            'publis': cls.publis,
+            'keys': cls.keys,
+            'search_engine': cls.search_engine,
+        }
+
+        # Use safe_write pattern from gismo.common
+        destination = Path(path) / f"{Path(filename).stem}.pkl.zst"
+        if destination.exists() and not overwrite:
+            print(f"File {destination} already exists! Use overwrite option to overwrite.")
+        else:
+            with safe_write(destination) as f:
+                cctx = zstd.ZstdCompressor(level=3)
+                with cctx.stream_writer(f) as z:
+                    pickle.dump(state, z, protocol=5)
+
+        # Restore numba dict
+        if cls.search_engine is not None:
+            cls.search_engine.vectorizer.features_ = nb_dict
+
+    @classmethod
+    def load(cls, filename: str, path="."):
+        """Load class state from file."""
+        dest = Path(path) / f"{Path(filename).stem}.pkl.zst"
+        if not dest.exists():
+            dest = dest.with_suffix(".pkl")
+            if not dest.exists():
+                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dest)
+
+        dctx = zstd.ZstdDecompressor()
+        with open(dest, "rb") as f, dctx.stream_reader(f) as z:
+            state = pickle.load(z)
+
+        cls.authors = state['authors']
+        cls.publis = state['publis']
+        cls.keys = state['keys']
+        cls.search_engine = state['search_engine']
+
+        if cls.search_engine is not None:
+            cls.search_engine.vectorizer.features_ = cls.numbify_dict(
+                cls.search_engine.vectorizer.features_
+            )
+
+        cls._invalidate_cache()
+        cls._initialized = True
+
+    @classmethod
+    def dump_db(cls):
+        DATA_DIR.mkdir(parents=True, exist_ok=True)
+        cls.dump(LDB_STEM, path=DATA_DIR, overwrite=True)
+
+    @classmethod
+    def load_db(cls):
+        try:
+            cls.load(LDB_STEM, path=DATA_DIR)
+        except FileNotFoundError:
+            logger.warning("No LDB installed. Build or retrieve before using.")
+
+    @staticmethod
+    def delete_db():
+        if LDB_PATH.exists():
+            LDB_PATH.unlink()
+
+    @staticmethod
+    def numbify_dict(input_dict):
+        nb_dict = nb.typed.Dict.empty(key_type=nb.types.unicode_type, value_type=nb.types.int64)
+        for k, v in input_dict.items():
+            nb_dict[k] = v
+        return nb_dict
+
+
+@dataclass(repr=False)
+class LDBAuthor(Author, LDB):
+    key: str
+    aliases: list = field(default_factory=list)
+
+    @property
+    def url(self):
+        return f"https://dblp.org/pid/{self.key}.html"
+
+    def get_publications(self):
+        return LDB.from_author(self)
+
+
+@dataclass(repr=False)
+class LDBPublication(Publication, LDB):
+    key: str
+    metadata: dict = field(default_factory=dict)
+
+    @property
+    def url(self):
+        return self.metadata.get("url", f"https://dblp.org/rec/{self.key}.html")
+
+    @property
+    def stream(self):
+        if "streams" in self.metadata:
+            return f'https://dblp.org/streams/{self.metadata["streams"][0]}'
+        return None
gismap/sources/models.py
CHANGED
gismap/sources/multi.py
CHANGED
@@ -1,5 +1,6 @@
 from dataclasses import dataclass, field
-from bof.fuzz import
+from bof.fuzz import jit_square_factors
+from bof.feature_extraction import CountVectorizer
 import numpy as np

 from gismap.sources.models import Publication, Author
@@ -14,7 +15,7 @@ def score_author_source(dbauthor):
             return 2
         else:
             return 3
-    elif dbauthor.db_name
+    elif dbauthor.db_name in ["dblp", "ldb"]:
         return 1
     else:
         return 0
@@ -52,6 +53,8 @@ class SourcedAuthor(Author):
     def get_publications(self, clean=True, selector=None):
         if selector is None:
             selector = []
+        if not isinstance(selector, list):
+            selector = [selector]
         res = {
             p.key: p
             for a in self.sources
@@ -66,7 +69,7 @@ class SourcedAuthor(Author):


 publication_score_rosetta = {
-    "db_name": {"dblp": 1, "hal": 2},
+    "db_name": {"dblp": 1, "ldb": 1, "hal": 2},
     "venue": {"CoRR": -1, "unpublished": -2},
     "type": {"conference": 1, "journal": 2},
 }
@@ -135,7 +138,9 @@ def regroup_authors(auth_dict, pub_dict):
     }

     for pub in pub_dict.values():
-        pub.authors = [
+        pub.authors = [
+            redirection.get(a.key, redirection.get(a.name, a)) for a in pub.authors
+        ]


 def regroup_publications(pub_dict, threshold=85, length_impact=0.05, n_range=5):
@@ -156,19 +161,22 @@ def regroup_publications(pub_dict, threshold=85, length_impact=0.05, n_range=5):
     :class:`dict`
         Unified publications.
     """
+    if len(pub_dict) == 0:
+        return dict()
     pub_list = [p for p in pub_dict.values()]
     res = dict()
-    ... (13 removed lines, content not rendered in this diff view)
+    vectorizer = CountVectorizer(n_range=n_range)
+    x = vectorizer.fit_transform([p.title for p in pub_list])
+    y = x.T.tocsr()
+    jc_matrix = jit_square_factors(
+        x.indices, x.indptr, y.indices, y.indptr, len(pub_list), length_impact
+    )
+    done = np.zeros(len(pub_list), dtype=bool)
+    for i, paper in enumerate(pub_list):
+        if done[i]:
+            continue
+        locs = np.where(jc_matrix[i, :] > threshold)[0]
+        pub = SourcedPublication.from_sources([pub_list[i] for i in locs])
+        res[pub.key] = pub
+        done[locs] = True
     return res
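The rewritten `regroup_publications` is an explicit pipeline: titles are vectorized into character n-grams (`CountVectorizer`), `jit_square_factors` yields a pairwise similarity matrix, and publications whose rows clear `threshold` are greedily merged via `SourcedPublication.from_sources`. The new early return avoids running the vectorizer on an empty title list. A self-contained sketch of the greedy merge step, with a hand-made similarity matrix standing in for the `jit_square_factors` output:

    import numpy as np

    # Hypothetical pairwise title similarities (0-100); rows 0 and 1 are near-duplicates.
    jc_matrix = np.array([
        [100.0, 92.0, 10.0],
        [92.0, 100.0, 12.0],
        [10.0, 12.0, 100.0],
    ])
    threshold = 85
    done = np.zeros(len(jc_matrix), dtype=bool)
    groups = []
    for i in range(len(jc_matrix)):
        if done[i]:
            continue  # already absorbed into an earlier group
        locs = np.where(jc_matrix[i, :] > threshold)[0]
        groups.append(locs.tolist())
        done[locs] = True
    print(groups)  # [[0, 1], [2]]: entries 0 and 1 merge into one publication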
gismap/utils/common.py
CHANGED
@@ -30,7 +30,7 @@ def unlist(x):
     return x[0] if (isinstance(x, list) and x) else x


-def get_classes(root, key="name"):
+def get_classes(root, key="name", recurse=False):
     """
     Parameters
     ----------
@@ -38,6 +38,8 @@ def get_classes(root, key="name"):
         Starting class (can be abstract).
     key: :class:`str`, default='name'
         Attribute to look-up
+    recurse: bool, default=False
+        Recursively traverse subclasses.

     Returns
     -------
@@ -50,13 +52,16 @@ def get_classes(root, key="name"):
     >>> from gismap.sources.models import DB
     >>> subclasses = get_classes(DB, key='db_name')
     >>> dict(sorted(subclasses.items()))  # doctest: +NORMALIZE_WHITESPACE
-    {'dblp': <class 'gismap.sources.dblp.DBLP'>,
+    {'dblp': <class 'gismap.sources.dblp.DBLP'>,
+    'hal': <class 'gismap.sources.hal.HAL'>,
+    'ldb': <class 'gismap.sources.ldb.LDB'>}
     """
     result = {
         getattr(c, key): c for c in root.__subclasses__() if getattr(c, key, None)
     }
-
-
+    if recurse:
+        for c in root.__subclasses__():
+            result.update(get_classes(c, key=key, recurse=True))
     return result


@@ -83,20 +88,20 @@ def list_of_objects(clss, dico, default=None):

     >>> from gismap.sources.models import DB
     >>> subclasses = get_classes(DB, key='db_name')
-    >>> from gismap import HAL, DBLP
-    >>> list_of_objects([HAL, '
-    [<class 'gismap.sources.hal.HAL'>, <class 'gismap.sources.
+    >>> from gismap import HAL, DBLP, LDB
+    >>> list_of_objects([HAL, 'ldb'], subclasses)
+    [<class 'gismap.sources.hal.HAL'>, <class 'gismap.sources.ldb.LDB'>]
     >>> list_of_objects(None, subclasses, [DBLP])
     [<class 'gismap.sources.dblp.DBLP'>]
-    >>> list_of_objects(
-    [<class 'gismap.sources.
+    >>> list_of_objects(LDB, subclasses)
+    [<class 'gismap.sources.ldb.LDB'>]
     >>> list_of_objects('hal', subclasses)
     [<class 'gismap.sources.hal.HAL'>]
     """
     if default is None:
         default = []
     if clss is None:
-        return default
+        return list_of_objects(clss=default, dico=dico)
     elif isinstance(clss, str):
         return [dico[clss]]
     elif isinstance(clss, list):
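To see what the new `recurse` flag buys, here is a self-contained sketch that mirrors the patched function body on a toy class tree (the classes are hypothetical, not from gismap):

    def get_classes(root, key="name", recurse=False):
        # Mirror of the patched gismap.utils.common.get_classes.
        result = {
            getattr(c, key): c for c in root.__subclasses__() if getattr(c, key, None)
        }
        if recurse:
            for c in root.__subclasses__():
                result.update(get_classes(c, key=key, recurse=True))
        return result

    class Base:
        name = None

    class Child(Base):
        name = "child"

    class GrandChild(Child):
        name = "grandchild"

    print(get_classes(Base))                # {'child': <class '...Child'>}
    print(get_classes(Base, recurse=True))  # 'grandchild' now included as well

Without recursion, `__subclasses__()` only returns direct subclasses, so a class like `GrandChild` would stay invisible to callers such as the DB registry.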
gismap/utils/logger.py
CHANGED
gismap/utils/requests.py
CHANGED
@@ -13,7 +13,7 @@ session.headers.update(
 )


-def get(url, params=None, n_trials=10):
+def get(url, params=None, n_trials=10, verify=True):
     """
     Parameters
     ----------
@@ -21,6 +21,10 @@ def get(url, params=None, n_trials=10):
         Entry point to fetch.
     params: :class:`dict`, optional
         Get arguments (appended to URL).
+    n_trials: :class:`int`, default=10
+        Number of attempts to fetch URL.
+    verify: :class:`bool`, default=True
+        Verify certificates.

     Returns
     -------
@@ -29,7 +33,7 @@ def get(url, params=None, n_trials=10):
     """
     for attempt in range(n_trials):
         try:
-            r = session.get(url, params=params)
+            r = session.get(url, params=params, verify=verify)
             if r.status_code == 429:
                 try:
                     t = int(r.headers["Retry-After"])