gismap 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gismap/build.py +23 -1
- gismap/gisgraphs/builder.py +23 -8
- gismap/gisgraphs/graph.py +15 -0
- gismap/gisgraphs/widget.py +28 -8
- gismap/lab/egomap.py +29 -7
- gismap/lab/expansion.py +35 -1
- gismap/lab/lab_author.py +30 -0
- gismap/lab/labmap.py +44 -3
- gismap/lab_examples/cedric.py +19 -6
- gismap/lab_examples/lamsade.py +9 -7
- gismap/lab_examples/toulouse.py +6 -2
- gismap/search.py +61 -1
- gismap/sources/dblp.py +22 -0
- gismap/sources/dblp_ttl.py +30 -11
- gismap/sources/hal.py +38 -0
- gismap/sources/ldb.py +315 -100
- gismap/sources/models.py +83 -0
- gismap/sources/multi.py +65 -0
- gismap/utils/common.py +58 -1
- gismap/utils/text.py +1 -1
- gismap/utils/zlist.py +24 -4
- {gismap-0.4.0.dist-info → gismap-0.4.1.dist-info}/METADATA +11 -9
- gismap-0.4.1.dist-info/RECORD +43 -0
- gismap-0.4.0.dist-info/RECORD +0 -43
- {gismap-0.4.0.dist-info → gismap-0.4.1.dist-info}/WHEEL +0 -0
- {gismap-0.4.0.dist-info → gismap-0.4.1.dist-info}/licenses/AUTHORS.md +0 -0
gismap/sources/ldb.py
CHANGED
@@ -19,27 +19,58 @@ import requests
 
 from gismap.sources.dblp_ttl import publis_streamer
 from gismap.sources.models import DB, Author, Publication
+from gismap.utils.common import Data
 from gismap.utils.logger import logger
-from gismap.utils.text import asciify
+from gismap.utils.text import normalized_name
 from gismap.utils.zlist import ZList
 
 
-DATA_DIR = Path(
-
-
-
-
+DATA_DIR = Path(
+    user_data_dir(
+        appname="gismap",
+        appauthor=False,
+    )
+)
 LDB_STEM = "ldb"
-
-LDB_PATH = DATA_DIR / f"{LDB_STEM}.pkl.zst"
-
-TTL_URL = "https://dblp.org/rdf/dblp.ttl.gz"
-
-# GitHub release asset constants
 GITHUB_REPO = "balouf/gismap"
-
-
-
+
+LDB_PARAMETERS = Data(
+    {
+        "search": {"limit": 3, "cutoff": 87.0, "slack": 1.0},
+        "bof": {"n_range": 2, "length_impact": 0.1},
+        "frame_size": {"authors": 512, "publis": 256},
+        "io": {
+            "source": "https://dblp.org/rdf/dblp.ttl.gz",
+            "destination": DATA_DIR / f"{LDB_STEM}.pkl.zst",
+            "metadata": DATA_DIR / f"{LDB_STEM}.json",
+            "gh_api": f"https://api.github.com/repos/{GITHUB_REPO}/releases",
+        },
+    }
+)
+"""
+Global configuration parameters for the Local DBLP (LDB) pipeline.
+
+Structure:
+- search:
+    - limit: maximum number of candidates retrieved per query.
+    - cutoff: minimal similarity score required to keep a candidate.
+    - slack: tolerance around the cutoff for borderline matches.
+- bof (Bag-of-Factors):
+    - n_range: max factor size (higher is better but more expensive).
+    - length_impact: how to compare two inputs of different size.
+- frame_size:
+    - authors: maximum number of authors kept in a single frame/batch.
+    - publis: maximum number of publications kept in a single frame/batch.
+- io:
+    - source: URL/file location of the DBLP RDF dump used as raw input.
+    - destination: local path where the compressed preprocessed dataset is / will be stored.
+    - gh_api: GitHub API endpoint used to fetch release information for the project.
+
+LDB_PARAMETERS is a Data (RecursiveDict) instance, so nested fields can be
+accessed with attribute notation, e.g.:
+    LDB_PARAMETERS.search.limit
+    LDB_PARAMETERS.io.destination
+"""
 
 
 @dataclass(repr=False)
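The new module docstring documents attribute-style access on the `Data` wrapper. As a quick orientation, a minimal sketch of reading the defaults (assuming gismap 0.4.1 is installed; the destination path is platform-dependent):

```python
from gismap.sources.ldb import LDB_PARAMETERS

# Dotted access into the nested configuration, as documented above.
print(LDB_PARAMETERS.search.limit)    # 3
print(LDB_PARAMETERS.bof.n_range)     # 2
print(LDB_PARAMETERS.io.destination)  # e.g. ~/.local/share/gismap/ldb.pkl.zst on Linux
```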
@@ -49,9 +80,30 @@ class LDB(DB):
 
     LDB is a class-only database - it should not be instantiated.
     All methods are classmethods and state is stored in class variables.
+
+    Examples
+    --------
+
+    Public DB methods ensure that the DB is loaded but if you need to use a specific LDB method, prepare the DB first.
+
+    >>> LDB._ensure_loaded()
+    >>> LDB.author_by_key("66/2077")
+    LDBAuthor(name='Fabien Mathieu', key='66/2077')
+    >>> pubs = sorted(LDB.author_publications('66/2077'), key = lambda p: p.year)
+    >>> pub = pubs[0]
+    >>> pub.metadata
+    {'url': 'http://www2003.org/cdrom/papers/poster/p102/p102-mathieu.htm', 'streams': ['conf/www']}
+    >>> LDB.db_info()  # doctest: +ELLIPSIS
+    {'tag': 'v0.4.0', 'downloaded_at': '2026-...', 'size': ..., 'path': ...}
+    >>> LDB.check_update()
+    >>> ldb = LDB()
+    Traceback (most recent call last):
+    ...
+    TypeError: LDB should not be instantiated. Use class methods directly, e.g., LDB.search_author(name)
     """
+
     db_name: ClassVar[str] = LDB_STEM
-
+    parameters: ClassVar[Data] = LDB_PARAMETERS
 
     # Class-level state (replaces instance attributes)
     authors: ClassVar[ZList | None] = None
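The doctest above shows that `LDB()` raises `TypeError`. One standard way to enforce such a class-only API is to raise from `__init__`; a self-contained sketch of the pattern (illustrative only, not the actual `DB` base class):

```python
class ClassOnly:
    """All state lives on the class; instances are forbidden."""

    def __init__(self):
        raise TypeError(
            f"{type(self).__name__} should not be instantiated. "
            "Use class methods directly."
        )

    @classmethod
    def ping(cls):
        return f"{cls.__name__} is alive"


print(ClassOnly.ping())  # fine: no instance involved
# ClassOnly()  # would raise TypeError, as in the doctest above
```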
@@ -72,24 +124,81 @@ class LDB(DB):
         """Lazy-load the database if not already loaded."""
         if cls._initialized:
             return
-        if
-            cls.load_db()
-        else:
+        if not cls.parameters.io.destination.exists():
             logger.info("LDB not found locally. Attempting to retrieve from GitHub...")
             try:
                 cls.retrieve()
-                cls.load_db()
             except RuntimeError as e:
                 logger.warning(f"Could not auto-retrieve LDB: {e}")
+        cls.load_db()
 
     @classmethod
-    def build_db(cls,
-
-
+    def build_db(cls, limit=None):
+        """
+        Build the LDB database from a DBLP TTL dump.
+
+        Parses the DBLP RDF/TTL file to extract publications and authors,
+        stores them in compressed ZList structures, and builds a fuzzy
+        search engine for author name lookups.
+
+        Parameters
+        ----------
+        limit: :class:`int`, optional
+            Maximum number of publications to process. If None, processes
+            the entire database. Useful for testing with a subset.
+
+        Notes
+        -----
+        This method populates the class-level attributes:
+
+        - ``authors``: ZList of (key, name, publication_indices) tuples
+        - ``publis``: ZList of publication records
+        - ``keys``: dict mapping author keys to indices
+        - ``search_engine``: fuzzy search Process for author lookups
+
+        After building, call :meth:`dump_db` to persist the database.
+
+        Examples
+        --------
+        Build from the default DBLP source:
+
+        >>> LDB.build_db()  # doctest: +SKIP
+        >>> LDB.dump_db()  # doctest: +SKIP
+
+        Build a small test database:
+
+        >>> LDB.build_db(limit=1000)
+        >>> LDB.authors[0]
+        ('78/459-1', 'Manish Singh', [0])
+
+        Save your build in a non-default file:
+
+        >>> from tempfile import TemporaryDirectory
+        >>> from pathlib import Path
+        >>> with TemporaryDirectory() as tmpdirname:
+        ...     LDB.dump(filename="test.zst", path=tmpdirname)
+        ...     [file.name for file in Path(tmpdirname).glob("*")]
+        ['test.zst']
+
+        In case you don't like your build and want to reload your local database from disk:
+
+        >>> LDB.load_db()
+        """
+        source = cls.parameters.io.source
         authors_dict = dict()
         logger.info("Retrieve publications")
-        with ZList(frame_size=
-            for i, (
+        with ZList(frame_size=cls.parameters.frame_size.publis) as publis:
+            for i, (
+                key,
+                title,
+                typ,
+                authors,
+                url,
+                streams,
+                pages,
+                venue,
+                year,
+            ) in enumerate(publis_streamer(source)):
                 auth_indices = []
                 for auth_key, auth_name in authors.items():
                     if auth_key not in authors_dict:
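The new `_ensure_loaded` flow downloads only when the local file is missing, and always finishes with `load_db()` (which, as the later hunks show, can rebuild from source). A simplified standalone sketch of this lazy-initialization guard; `LazyStore`, `fetch` and `load` are hypothetical stand-ins, not gismap API:

```python
from pathlib import Path


class LazyStore:
    """Toy illustration of the _ensure_loaded guard; not the real LDB."""

    _initialized = False
    path = Path("store.bin")  # stand-in for parameters.io.destination
    data = b""

    @classmethod
    def fetch(cls):
        # Hypothetical stand-in for LDB.retrieve().
        cls.path.write_bytes(b"payload")

    @classmethod
    def load(cls):
        # Hypothetical stand-in for LDB.load_db().
        cls.data = cls.path.read_bytes()
        cls._initialized = True

    @classmethod
    def ensure_loaded(cls):
        if cls._initialized:
            return  # fast path: already in memory
        if not cls.path.exists():
            try:
                cls.fetch()
            except RuntimeError as exc:
                print(f"could not fetch: {exc}")
        cls.load()


LazyStore.ensure_loaded()
print(LazyStore.data)  # b'payload'
```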
@@ -97,26 +206,37 @@ class LDB(DB):
                     else:
                         authors_dict[auth_key][2].append(i)
                     auth_indices.append(authors_dict[auth_key][0])
-                publis.append(
+                publis.append(
+                    (key, title, typ, auth_indices, url, streams, pages, venue, year)
+                )
                 if i == limit:
                     break
         cls.publis = publis
         logger.info(f"{len(publis)} publications retrieved.")
         logger.info("Compact authors")
-        with ZList(frame_size=
+        with ZList(frame_size=cls.parameters.frame_size.authors) as authors:
             for key, (_, name, pubs) in tqdm(authors_dict.items()):
                 authors.append((key, name, pubs))
         cls.authors = authors
         cls.keys = {k: v[0] for k, v in authors_dict.items()}
         del authors_dict
-        cls.
-        cls.search_engine.fit([asciify(a[1]) for a in authors])
-        cls.search_engine.choices = np.arange(len(authors))
-        cls.search_engine.vectorizer.features_ = cls.numbify_dict(cls.search_engine.vectorizer.features_)
-        logger.info(f"{len(cls.authors)} compacted.")
+        cls._build_search_engine()
         cls._invalidate_cache()
         cls._initialized = True
 
+    @classmethod
+    def _build_search_engine(cls):
+        cls.search_engine = Process(
+            n_range=cls.parameters.bof.n_range,
+            length_impact=cls.parameters.bof.length_impact,
+        )
+        cls.search_engine.fit([normalized_name(a[1]) for a in cls.authors])
+        cls.search_engine.choices = np.arange(len(cls.authors))
+        cls.search_engine.vectorizer.features_ = cls.numbify_dict(
+            cls.search_engine.vectorizer.features_
+        )
+        logger.info(f"{len(cls.authors)} authors indexed.")
+
     @classmethod
     @lru_cache(maxsize=50000)
     def author_by_index(cls, i):
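In the loop above, each author gets a stable integer index on first occurrence, and later publications are appended to its index list. The first-occurrence assignment itself falls between the two hunks and is not shown; the shape below is a plausible reconstruction consistent with `authors_dict[auth_key][2].append(i)` and `v[0]`. A toy version of that bookkeeping:

```python
# Toy publications: (title, {author_key: author_name}).
publications = [
    ("p1", {"a1": "Ada"}),
    ("p2", {"a1": "Ada", "a2": "Bob"}),
]

authors_dict = {}
for i, (_, authors) in enumerate(publications):
    for auth_key, auth_name in authors.items():
        if auth_key not in authors_dict:
            # (index, name, publication indices) on first occurrence.
            authors_dict[auth_key] = (len(authors_dict), auth_name, [i])
        else:
            authors_dict[auth_key][2].append(i)

print(authors_dict)
# {'a1': (0, 'Ada', [0, 1]), 'a2': (1, 'Bob', [1])}
```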
@@ -133,10 +253,17 @@ class LDB(DB):
         key, title, typ, authors, url, streams, pages, venue, year = cls.publis[i]
         if venue is None:
             venue = "unpublished"
-        return {
-
-
-
+        return {
+            "key": key,
+            "title": title,
+            "type": typ,
+            "authors": authors,
+            "url": url,
+            "streams": streams,
+            "pages": pages,
+            "venue": venue,
+            "year": year,
+        }
 
     @classmethod
     def author_publications(cls, key):
@@ -157,10 +284,16 @@ class LDB(DB):
 
     @classmethod
     @lru_cache(maxsize=1000)
-    def search_author(cls, name
+    def search_author(cls, name):
         cls._ensure_loaded()
-        res = cls.search_engine.extract(
-
+        res = cls.search_engine.extract(
+            normalized_name(name),
+            limit=cls.parameters.search.limit,
+        )
+        if not res:
+            return []
+        target = max(cls.parameters.search.cutoff, res[0][1] - cls.parameters.search.slack)
+        res = [r[0] for r in res if r[1] > target]
         sorted_ids = {i: cls.author_by_index(i) for i in sorted(res)}
         return [sorted_ids[i] for i in res]
 
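`search_author` now keeps only candidates whose score beats `max(cutoff, best - slack)`: an absolute floor that tightens when the top match is strong. A worked sketch with made-up scores and the defaults from `LDB_PARAMETERS` (`cutoff=87.0`, `slack=1.0`):

```python
# Hypothetical (author_index, score) pairs as returned by the fuzzy engine.
res = [(42, 95.0), (7, 94.5), (13, 88.0)]

cutoff, slack = 87.0, 1.0
target = max(cutoff, res[0][1] - slack)  # max(87.0, 94.0) = 94.0
kept = [r[0] for r in res if r[1] > target]
print(kept)  # [42, 7] -- 88.0 clears the cutoff but not the tightened floor
```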
@@ -194,10 +327,11 @@ class LDB(DB):
         :class:`RuntimeError`
             If release not found or API request fails.
         """
+        api_url = cls.parameters.io.gh_api
         if tag is None:
-            url = f"{
+            url = f"{api_url}/latest"
         else:
-            url = f"{
+            url = f"{api_url}/tags/{tag}"
 
         try:
             response = requests.get(url, timeout=30)
@@ -229,15 +363,18 @@ class LDB(DB):
         response = requests.get(url, stream=True, timeout=30)
         response.raise_for_status()
 
-        total_size = int(response.headers.get(
-
-        with
-
-
-
-
-
+        total_size = int(response.headers.get("content-length", 0))
+
+        with (
+            open(dest, "wb") as f,
+            tqdm(
+                desc=desc,
+                total=total_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as pbar,
+        ):
             for chunk in response.iter_content(chunk_size=8192):
                 if chunk:
                     f.write(chunk)
@@ -252,17 +389,19 @@ class LDB(DB):
             "size": size,
             "downloaded_at": datetime.now(timezone.utc).isoformat(),
         }
-
-
+        meta_path = cls.parameters.io.metadata
+        meta_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(meta_path, "w") as f:
             json.dump(meta, f, indent=2)
 
     @classmethod
     def _load_meta(cls) -> dict | None:
         """Load version metadata from JSON file."""
-
+        meta_path = cls.parameters.io.metadata
+        if not meta_path.exists():
             return None
         try:
-            with open(
+            with open(meta_path, "r") as f:
                 return json.load(f)
         except (json.JSONDecodeError, IOError):
             return None
@@ -282,9 +421,18 @@ class LDB(DB):
 
         Examples
         --------
-
-
-
+
+        The following will get you a LDB if you do not have one.
+
+        >>> LDB.retrieve()  # Latest release (freshest data)
+        >>> LDB.retrieve("v0.4.0")  # Specific version
+        >>> LDB.retrieve("0.4.0")  # Also works without 'v' prefix
+
+        Of course, the tag/version must be LDB-ready.
+        >>> LDB.retrieve("v0.3.0")  # Too old for LDB
+        Traceback (most recent call last):
+        ...
+        RuntimeError: Asset 'ldb.pkl.zst' not found in release v0.3.0. Available assets: []
 
         Raises
         ------
@@ -301,43 +449,48 @@ class LDB(DB):
         release_info = cls._get_release_info(tag)
         release_tag = release_info["tag_name"]
 
+        destination = cls.parameters.io.destination
+
         # Check if already installed (unless force=True)
         if not force:
             meta = cls._load_meta()
-            if meta and meta.get("tag") == release_tag and
-                logger.info(
+            if meta and meta.get("tag") == release_tag and destination.exists():
+                logger.info(
+                    f"LDB version {release_tag} already installed. Use force=True to re-download."
+                )
                 return
 
         # Find ldb.pkl.zst asset in release
         assets = release_info.get("assets", [])
         ldb_asset = None
         for asset in assets:
-            if asset["name"] ==
+            if asset["name"] == destination.name:
                 ldb_asset = asset
                 break
 
         if ldb_asset is None:
             raise RuntimeError(
-                f"Asset '{
+                f"Asset '{destination.name}' not found in release {release_tag}. "
                 f"Available assets: {[a['name'] for a in assets]}"
             )
 
         download_url = ldb_asset["browser_download_url"]
         asset_size = ldb_asset["size"]
 
-        logger.info(
+        logger.info(
+            f"Downloading LDB from release {release_tag} ({asset_size / 1e9:.2f} GB)"
+        )
 
         # Download with progress bar
-        cls._download_file(download_url,
+        cls._download_file(download_url, destination, desc=f"LDB {release_tag}")
 
         # Save version metadata
         cls._save_meta(release_tag, download_url, asset_size)
 
-        #
-        cls.
-        cls._invalidate_cache()
+        # Load database and rebuild search engine locally
+        cls.load_db(restore_search=True)
 
-        logger.info(f"LDB {release_tag} successfully installed to {
+        logger.info(f"LDB {release_tag} successfully installed to {destination}")
 
     @classmethod
     def db_info(cls) -> dict | None:
@@ -350,14 +503,15 @@ class LDB(DB):
             Dictionary with tag, date, size, path; or None if not installed.
         """
         meta = cls._load_meta()
-
+        destination = cls.parameters.io.destination
+        if meta is None or not destination.exists():
             return None
 
         return {
             "tag": meta.get("tag"),
             "downloaded_at": meta.get("downloaded_at"),
             "size": meta.get("size"),
-            "path": str(
+            "path": str(destination),
         }
 
     @classmethod
@@ -391,25 +545,27 @@ class LDB(DB):
         return None
 
     @classmethod
-    def dump(cls, filename: str, path=".", overwrite=False):
+    def dump(cls, filename: str, path=".", overwrite=False, include_search=True):
         """Save class state to file."""
         # Convert numba dict to regular dict for pickling
         nb_dict = None
-        if cls.search_engine is not None:
+        if include_search and cls.search_engine is not None:
             nb_dict = cls.search_engine.vectorizer.features_
             cls.search_engine.vectorizer.features_ = dict(nb_dict)
 
         state = {
-
-
-
-
+            "authors": cls.authors,
+            "publis": cls.publis,
+            "keys": cls.keys,
+            "search_engine": cls.search_engine if include_search else None,
         }
 
         # Use safe_write pattern from gismo.common
-        destination = Path(path) /
+        destination = Path(path) / filename
         if destination.exists() and not overwrite:
-            print(
+            print(
+                f"File {destination} already exists! Use overwrite option to overwrite."
+            )
         else:
             with safe_write(destination) as f:
                 cctx = zstd.ZstdCompressor(level=3)
@@ -417,15 +573,13 @@ class LDB(DB):
                     pickle.dump(state, z, protocol=5)
 
         # Restore numba dict
-        if cls.search_engine is not None:
+        if include_search and cls.search_engine is not None:
             cls.search_engine.vectorizer.features_ = nb_dict
 
     @classmethod
-    def load(cls, filename: str, path="."):
+    def load(cls, filename: str, path=".", restore_search=False):
         """Load class state from file."""
-        dest = Path(path) /
-        if not dest.exists():
-            dest = dest.with_suffix(".pkl")
+        dest = Path(path) / filename
         if not dest.exists():
             raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dest)
 
@@ -433,12 +587,15 @@ class LDB(DB):
         with open(dest, "rb") as f, dctx.stream_reader(f) as z:
             state = pickle.load(z)
 
-        cls.authors = state[
-        cls.publis = state[
-        cls.keys = state[
-        cls.search_engine = state[
+        cls.authors = state["authors"]
+        cls.publis = state["publis"]
+        cls.keys = state["keys"]
+        cls.search_engine = state["search_engine"]
 
-        if
+        if restore_search:
+            cls._build_search_engine()
+            cls.dump(filename=filename, path=path, overwrite=True, include_search=True)
+        elif cls.search_engine is not None:
             cls.search_engine.vectorizer.features_ = cls.numbify_dict(
                 cls.search_engine.vectorizer.features_
             )
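`dump` and `load` persist the state dict as a zstandard-compressed pickle stream. A minimal standalone round trip using the same `zstandard` calls (`stream_writer` / `stream_reader`), assuming the `zstandard` package is installed:

```python
import pickle

import zstandard as zstd

state = {"authors": ["Ada", "Bob"], "keys": {"a1": 0}}

# Compress-and-pickle, as in dump() above.
with open("state.pkl.zst", "wb") as f:
    cctx = zstd.ZstdCompressor(level=3)
    with cctx.stream_writer(f) as z:
        pickle.dump(state, z, protocol=5)

# Decompress-and-unpickle, as in load() above.
with open("state.pkl.zst", "rb") as f:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(f) as z:
        restored = pickle.load(z)

assert restored == state
```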
@@ -447,25 +604,48 @@ class LDB(DB):
         cls._initialized = True
 
     @classmethod
-    def dump_db(cls):
-
-
+    def dump_db(cls, include_search=True):
+        destination = cls.parameters.io.destination
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        cls.dump(
+            destination.name,
+            path=destination.parent,
+            overwrite=True,
+            include_search=include_search,
+        )
 
     @classmethod
-    def load_db(cls):
+    def load_db(cls, restore_search=False):
+        destination = cls.parameters.io.destination
         try:
-            cls.load(
+            cls.load(
+                destination.name, path=destination.parent, restore_search=restore_search
+            )
         except FileNotFoundError:
-            logger.warning("No LDB
+            logger.warning("No LDB found. Building from source...")
+            cls.build_db()
+            cls.dump_db()
+        except TypeError as e:
+            if "code expected at most" in str(e):
+                logger.warning(
+                    "LDB file incompatible with this Python version. Rebuilding from source..."
+                )
+                cls.build_db()
+                cls.dump_db()
+            else:
+                raise
 
-    @
-    def delete_db():
-
-
+    @classmethod
+    def delete_db(cls):
+        destination = cls.parameters.io.destination
+        if destination.exists():
+            destination.unlink()
 
     @staticmethod
     def numbify_dict(input_dict):
-        nb_dict = nb.typed.Dict.empty(
+        nb_dict = nb.typed.Dict.empty(
+            key_type=nb.types.unicode_type, value_type=nb.types.int64
+        )
         for k, v in input_dict.items():
             nb_dict[k] = v
         return nb_dict
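`numbify_dict` converts the vectorizer's feature table into a `numba.typed.Dict`, which (unlike a plain dict) can be used inside nopython-compiled functions. A small sketch, assuming numba is installed; `lookup` is a hypothetical consumer, not gismap code:

```python
import numba as nb


def numbify(d):
    # Same conversion as numbify_dict above: str keys, int64 values.
    nb_dict = nb.typed.Dict.empty(
        key_type=nb.types.unicode_type, value_type=nb.types.int64
    )
    for k, v in d.items():
        nb_dict[k] = v
    return nb_dict


@nb.njit
def lookup(table, key):
    # Typed dicts are readable from JIT-compiled code.
    return table[key]


table = numbify({"alice": 0, "bob": 1})
print(lookup(table, "bob"))  # 1
```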
@@ -473,6 +653,21 @@ class LDB(DB):
 
 @dataclass(repr=False)
 class LDBAuthor(Author, LDB):
+    """
+    Author from the LDB (Local DBLP) database.
+
+    LDB provides local access to DBLP data without rate limiting.
+
+    Parameters
+    ----------
+    name: :class:`str`
+        The author's name.
+    key: :class:`str`
+        DBLP person identifier (pid).
+    aliases: :class:`list`
+        Alternative names for the author.
+    """
+
     key: str
     aliases: list = field(default_factory=list)
 
@@ -484,9 +679,29 @@ class LDBAuthor(Author, LDB):
         return LDB.from_author(self)
 
 
-
 @dataclass(repr=False)
 class LDBPublication(Publication, LDB):
+    """
+    Publication from the LDB (Local DBLP) database.
+
+    Parameters
+    ----------
+    title: :class:`str`
+        Publication title.
+    authors: :class:`list`
+        List of :class:`LDBAuthor` objects.
+    venue: :class:`str`
+        Publication venue.
+    type: :class:`str`
+        Publication type.
+    year: :class:`int`
+        Publication year.
+    key: :class:`str`
+        DBLP record key.
+    metadata: :class:`dict`
+        Additional metadata (URL, streams, pages).
+    """
+
     key: str
     metadata: dict = field(default_factory=dict)
 
@@ -497,5 +712,5 @@ class LDBPublication(Publication, LDB):
     @property
     def stream(self):
         if "streams" in self.metadata:
-            return f
+            return f"https://dblp.org/streams/{self.metadata['streams'][0]}"
         return None
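Given the metadata shown in the class docstring earlier (`{'streams': ['conf/www']}`), the `stream` property resolves to a dblp stream URL; the construction boils down to:

```python
metadata = {"streams": ["conf/www"]}
print(f"https://dblp.org/streams/{metadata['streams'][0]}")
# https://dblp.org/streams/conf/www
```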