gismap 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gismap/__init__.py +1 -0
- gismap/build.py +4 -0
- gismap/gisgraphs/widget.py +5 -5
- gismap/lab/lab_author.py +4 -7
- gismap/lab/labmap.py +3 -4
- gismap/lab_examples/lamsade.py +43 -0
- gismap/sources/dblp.py +1 -1
- gismap/sources/dblp_ttl.py +168 -0
- gismap/sources/hal.py +2 -2
- gismap/sources/ldb.py +501 -0
- gismap/sources/multi.py +2 -2
- gismap/utils/common.py +15 -10
- gismap/utils/logger.py +2 -0
- gismap/utils/requests.py +3 -1
- gismap/utils/zlist.py +68 -0
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/METADATA +18 -5
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/RECORD +19 -14
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/WHEEL +1 -1
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/licenses/AUTHORS.md +0 -0
gismap/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from importlib.metadata import metadata
|
|
|
4
4
|
|
|
5
5
|
from gismap.sources.hal import HAL as HAL, HALAuthor as HALAuthor
|
|
6
6
|
from gismap.sources.dblp import DBLP as DBLP, DBLPAuthor as DBLPAuthor
|
|
7
|
+
from gismap.sources.ldb import LDB as LDB, LDBAuthor as LDBAuthor
|
|
7
8
|
from gismap.utils.common import get_classes as get_classes
|
|
8
9
|
from gismap.gismo import make_gismo as make_gismo
|
|
9
10
|
from gismap.search import (
|
gismap/build.py
ADDED
gismap/gisgraphs/widget.py
CHANGED
|
@@ -33,7 +33,7 @@ def safe_filename(name):
|
|
|
33
33
|
return f"gismap-{safe_str[:60]}.html"
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
place_holder = "Diego Perino, The-Dang Huynh, François Durand (hal: fradurand,
|
|
36
|
+
place_holder = "Diego Perino, The-Dang Huynh, François Durand (hal: fradurand, ldb: 38/11269), Rim Kaddah, Leonardo Linguaglossa, Céline Comte"
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class GismapWidget:
|
|
@@ -66,7 +66,7 @@ class GismapWidget:
|
|
|
66
66
|
layout=widgets.Layout(width="50%", height="100px"),
|
|
67
67
|
)
|
|
68
68
|
self.dbs = widgets.RadioButtons(
|
|
69
|
-
options=["HAL", "
|
|
69
|
+
options=["HAL", "LDB", "Both"],
|
|
70
70
|
description="DB(s):",
|
|
71
71
|
layout=widgets.Layout(width="80px", max_width="20%"),
|
|
72
72
|
)
|
|
@@ -100,9 +100,9 @@ class GismapWidget:
|
|
|
100
100
|
dbs = (
|
|
101
101
|
"hal"
|
|
102
102
|
if self.dbs.value == "HAL"
|
|
103
|
-
else "
|
|
104
|
-
if self.dbs.value == "
|
|
105
|
-
else ["hal", "
|
|
103
|
+
else "ldb"
|
|
104
|
+
if self.dbs.value == "LDB"
|
|
105
|
+
else ["hal", "ldb"]
|
|
106
106
|
)
|
|
107
107
|
name = self.names.value
|
|
108
108
|
pattern = r",\s*(?![^()]*\))"
|
gismap/lab/lab_author.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
2
|
import re
|
|
3
3
|
|
|
4
|
-
from gismap import get_classes
|
|
4
|
+
from gismap import get_classes
|
|
5
5
|
from gismap.sources.models import DB, db_class_to_auth_class
|
|
6
6
|
from gismap.sources.multi import SourcedAuthor, sort_author_sources
|
|
7
7
|
from gismap.utils.common import LazyRepr, list_of_objects
|
|
8
8
|
from gismap.utils.logger import logger
|
|
9
9
|
|
|
10
10
|
db_dict = get_classes(DB, key="db_name")
|
|
11
|
-
default_dbs = [
|
|
11
|
+
default_dbs = ["hal", "ldb"]
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
@dataclass(repr=False)
|
|
@@ -27,8 +27,6 @@ class AuthorMetadata(LazyRepr):
|
|
|
27
27
|
Group of the author.
|
|
28
28
|
position: :class:`tuple`
|
|
29
29
|
Coordinates of the author.
|
|
30
|
-
keys: :class:`dict`
|
|
31
|
-
Some DB key values of the author.
|
|
32
30
|
"""
|
|
33
31
|
|
|
34
32
|
url: str = None
|
|
@@ -46,12 +44,11 @@ class LabAuthor(SourcedAuthor):
|
|
|
46
44
|
|
|
47
45
|
Improper key/values are ignored (with a warning).
|
|
48
46
|
|
|
49
|
-
|
|
50
|
-
>>> dummy= LabAuthor("My Name(img: https://my.url.img, group:me,url:https://mysite.org,hal:key1,dblp:toto,badkey:hello,no_colon_separator)")
|
|
47
|
+
>>> dummy= LabAuthor("My Name(img: https://my.url.img, group:me,url:https://mysite.org,hal:key1,ldb:toto,badkey:hello,no_colon_separator)")
|
|
51
48
|
>>> dummy.metadata
|
|
52
49
|
AuthorMetadata(url='https://mysite.org', img='https://my.url.img', group='me')
|
|
53
50
|
>>> dummy.sources
|
|
54
|
-
[HALAuthor(name='My Name', key='key1'),
|
|
51
|
+
[HALAuthor(name='My Name', key='key1'), LDBAuthor(name='My Name', key='toto')]
|
|
55
52
|
|
|
56
53
|
You can enter multiple keys for the same DB. HAL key types are automatically detected.
|
|
57
54
|
|
gismap/lab/labmap.py
CHANGED
|
@@ -38,7 +38,7 @@ class LabMap(MixInIO):
|
|
|
38
38
|
----------
|
|
39
39
|
name: :class:`str`
|
|
40
40
|
Name of the lab. Can be set as class or instance attribute.
|
|
41
|
-
dbs: :class:`list`, default=[:class:`~gismap.sources.hal.HAL`, :class:`~gismap.sources.
|
|
41
|
+
dbs: :class:`list`, default=[:class:`~gismap.sources.hal.HAL`, :class:`~gismap.sources.ldb.LDB`]
|
|
42
42
|
List of DB sources to use.
|
|
43
43
|
|
|
44
44
|
|
|
@@ -57,8 +57,7 @@ class LabMap(MixInIO):
|
|
|
57
57
|
def __init__(self, name=None, dbs=None):
|
|
58
58
|
if name is not None:
|
|
59
59
|
self.name = name
|
|
60
|
-
|
|
61
|
-
self.dbs = list_of_objects(dbs, db_dict, default=default_dbs)
|
|
60
|
+
self.dbs = dbs
|
|
62
61
|
self.author_selectors = [author_taboo_filter()]
|
|
63
62
|
self.publication_selectors = [
|
|
64
63
|
publication_size_filter(),
|
|
@@ -92,7 +91,7 @@ class LabMap(MixInIO):
|
|
|
92
91
|
if not all(f(author) for f in self.author_selectors):
|
|
93
92
|
continue
|
|
94
93
|
if len(author.sources) == 0:
|
|
95
|
-
author.auto_sources(dbs=self.dbs)
|
|
94
|
+
author.auto_sources(dbs=list_of_objects(self.dbs, db_dict, default=default_dbs))
|
|
96
95
|
if author.sources:
|
|
97
96
|
self.authors[author.key] = author
|
|
98
97
|
if author.metadata.img is None:
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from bs4 import BeautifulSoup as Soup
|
|
2
|
+
|
|
3
|
+
from gismap.lab import LabAuthor
|
|
4
|
+
from gismap.lab.lab_author import AuthorMetadata
|
|
5
|
+
from gismap.lab.labmap import LabMap
|
|
6
|
+
from gismap.utils.requests import get
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def lamsade_parse(div):
|
|
10
|
+
"""
|
|
11
|
+
Parameters
|
|
12
|
+
----------
|
|
13
|
+
div: :class:`~bs4.BeautifulSoup`
|
|
14
|
+
Soup of the div of one researcher
|
|
15
|
+
|
|
16
|
+
Returns
|
|
17
|
+
-------
|
|
18
|
+
:class:`tuple`
|
|
19
|
+
name, image url (or None), webpage (or None)
|
|
20
|
+
"""
|
|
21
|
+
img = div.img['src'] if div.img else None
|
|
22
|
+
url = div.a['href'] if div.a else None
|
|
23
|
+
name = div.h2.text.strip().title()
|
|
24
|
+
name = " ".join(name.split(" ", 1)[::-1])
|
|
25
|
+
return name, img, url
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Lamsade(LabMap):
|
|
29
|
+
"""
|
|
30
|
+
Class for handling the Lamsade team (Dauphine).
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
name = "Lamsade"
|
|
34
|
+
base_url = "https://www.lamsade.dauphine.fr/"
|
|
35
|
+
directory = "fr/personnes/enseignants-chercheurs-et-chercheurs.html"
|
|
36
|
+
|
|
37
|
+
def _author_iterator(self):
|
|
38
|
+
soup = Soup(get(self.base_url+self.directory), features="lxml")
|
|
39
|
+
for a in soup('div', class_="dauphinecv-item"):
|
|
40
|
+
name, img, url = lamsade_parse(a)
|
|
41
|
+
img = self.base_url+img if img else None
|
|
42
|
+
url = self.base_url+url if url else None
|
|
43
|
+
yield LabAuthor(name=name, metadata=AuthorMetadata(url=url, img=img, group=self.name))
|
gismap/sources/dblp.py
CHANGED
|
@@ -13,7 +13,7 @@ from gismap.utils.requests import get
|
|
|
13
13
|
class DBLP(DB):
|
|
14
14
|
db_name: ClassVar[str] = "dblp"
|
|
15
15
|
author_backoff: ClassVar[float] = 5.0
|
|
16
|
-
publi_backoff: ClassVar[float] =
|
|
16
|
+
publi_backoff: ClassVar[float] = 5.0
|
|
17
17
|
|
|
18
18
|
@classmethod
|
|
19
19
|
def search_author(cls, name, wait=True):
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import zlib
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
from gismap.utils.requests import session
|
|
9
|
+
from gismap.sources.dblp import DBLP_TYPES
|
|
10
|
+
|
|
11
|
+
key_re = r'<https://dblp.org/rec/([^>]+)>'
|
|
12
|
+
title_re = r'.*?dblp:title\s+"([^"]+)"'
|
|
13
|
+
type_re = r'.*?dblp:bibtexType\s+bibtex:(\w+)'
|
|
14
|
+
authors_re = r'.*?dblp:hasSignature\s+(\[.*\])\s*;'
|
|
15
|
+
url_re = r'(?:.*?dblp:primaryDocumentPage <([^>]+)>)?'
|
|
16
|
+
stream_re = r'(?:.*?dblp:publishedInStream ([^;]+) ;)?'
|
|
17
|
+
pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
|
|
18
|
+
venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'
|
|
19
|
+
year_re = r'.*?"(\d{4})"\^\^<http://www.w3.org/2001/XMLSchema#gYear>'
|
|
20
|
+
|
|
21
|
+
pub_re = re.compile("".join([key_re, title_re, type_re, authors_re,
|
|
22
|
+
url_re, stream_re, pages_re, venue_re, year_re]), flags=re.S)
|
|
23
|
+
|
|
24
|
+
streams_re = re.compile(r'<https://dblp.org/streams/((?:conf|journals)/[^>]+)>')
|
|
25
|
+
|
|
26
|
+
authid_re = re.compile(
|
|
27
|
+
r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp.org/pid/([^>]+?)>.*?]',
|
|
28
|
+
flags=re.S)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_block(dblp_block):
|
|
32
|
+
"""
|
|
33
|
+
Parameters
|
|
34
|
+
----------
|
|
35
|
+
dblp_block: :class:`str`
|
|
36
|
+
A DBLP publication, turtle format.
|
|
37
|
+
|
|
38
|
+
Returns
|
|
39
|
+
-------
|
|
40
|
+
key: :class:`str`
|
|
41
|
+
DBLP key.
|
|
42
|
+
title: :class:`str`
|
|
43
|
+
Publication title.
|
|
44
|
+
type: :class:`str`
|
|
45
|
+
Type of publication.
|
|
46
|
+
authors: :class:`dict`
|
|
47
|
+
Publication authors (key -> name)
|
|
48
|
+
url: :class:`str` or :class:`NoneType`
|
|
49
|
+
Publication URL.
|
|
50
|
+
stream: :class:`list` or :class:`NoneType`
|
|
51
|
+
Publication streams (normalized journal/conf).
|
|
52
|
+
pages: :class:`str` or :class:`NoneType`
|
|
53
|
+
Publication pages.
|
|
54
|
+
venue: :class:`str` or :class:`NoneType`
|
|
55
|
+
Publication venue (conf/journal).
|
|
56
|
+
year: :class:`int`
|
|
57
|
+
Year of publication.
|
|
58
|
+
"""
|
|
59
|
+
items = pub_re.search(dblp_block)
|
|
60
|
+
if items is None:
|
|
61
|
+
return None
|
|
62
|
+
key, title, typ, authors, url, stream, pages, venue, year = items.groups()
|
|
63
|
+
typ = typ.lower()
|
|
64
|
+
typ = DBLP_TYPES.get(typ, typ)
|
|
65
|
+
if stream:
|
|
66
|
+
stream = streams_re.findall(stream)
|
|
67
|
+
authors = {i: n for n, i in authid_re.findall(authors)}
|
|
68
|
+
if authors:
|
|
69
|
+
return key, title, typ, authors, url, stream, pages, venue, int(year)
|
|
70
|
+
return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@contextmanager
|
|
74
|
+
def get_stream(source, chunk_size=1024 * 64):
|
|
75
|
+
"""
|
|
76
|
+
Parameters
|
|
77
|
+
----------
|
|
78
|
+
source: :class:`str` or :class:`~pathlib.Path`
|
|
79
|
+
Where the content. Can be on a local file or on the Internet.
|
|
80
|
+
chunk_size: :class:`int`, optional
|
|
81
|
+
Desired chunk size. For streaming gz content, must be a multiple of 32kB.
|
|
82
|
+
|
|
83
|
+
Yields
|
|
84
|
+
-------
|
|
85
|
+
iterable
|
|
86
|
+
Chunk iterator that streams the content.
|
|
87
|
+
:class:`int`
|
|
88
|
+
Source size (used later to compute ETA).
|
|
89
|
+
"""
|
|
90
|
+
if isinstance(source, str) and source.startswith("https://"):
|
|
91
|
+
# URL HTTP
|
|
92
|
+
with session.get(source, stream=True) as r:
|
|
93
|
+
r.raise_for_status()
|
|
94
|
+
total = int(r.headers.get("content-length", 0)) or None
|
|
95
|
+
yield r.iter_content(chunk_size=chunk_size), total
|
|
96
|
+
else:
|
|
97
|
+
source = Path(source)
|
|
98
|
+
if not source.exists():
|
|
99
|
+
yield [], 0
|
|
100
|
+
return None
|
|
101
|
+
total = source.stat().st_size
|
|
102
|
+
with source.open("rb") as file_handle:
|
|
103
|
+
def read_chunks():
|
|
104
|
+
while True:
|
|
105
|
+
chunk = file_handle.read(chunk_size)
|
|
106
|
+
if not chunk:
|
|
107
|
+
break
|
|
108
|
+
yield chunk
|
|
109
|
+
yield read_chunks(), total
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
|
|
113
|
+
"""
|
|
114
|
+
Parameters
|
|
115
|
+
----------
|
|
116
|
+
source: :class:`str` or :class:`~pathlib.Path`
|
|
117
|
+
Where the DBLP turtle content is. Can be on a local file or on the Internet.
|
|
118
|
+
chunk_size: :class:`int`, optional
|
|
119
|
+
Desired chunk size. Must be a multiple of 32kB.
|
|
120
|
+
encoding: :class:`str`, default=unicode_escape
|
|
121
|
+
Encoding of stream.
|
|
122
|
+
|
|
123
|
+
Yields
|
|
124
|
+
-------
|
|
125
|
+
key: :class:`str`
|
|
126
|
+
DBLP key.
|
|
127
|
+
title: :class:`str`
|
|
128
|
+
Publication title.
|
|
129
|
+
type: :class:`str`
|
|
130
|
+
Type of publication.
|
|
131
|
+
authors: :class:`dict`
|
|
132
|
+
Publication authors (key -> name).
|
|
133
|
+
venue: :class:`str`
|
|
134
|
+
Publication venue (conf/journal).
|
|
135
|
+
year: :class:`int`
|
|
136
|
+
Year of publication.
|
|
137
|
+
"""
|
|
138
|
+
with get_stream(source, chunk_size=chunk_size) as (stream, total):
|
|
139
|
+
with tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
|
|
140
|
+
decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
|
|
141
|
+
text_buffer = ""
|
|
142
|
+
for chunk in stream:
|
|
143
|
+
if not chunk:
|
|
144
|
+
continue
|
|
145
|
+
|
|
146
|
+
pbar.update(len(chunk))
|
|
147
|
+
data = decomp.decompress(chunk)
|
|
148
|
+
if not data:
|
|
149
|
+
continue
|
|
150
|
+
text_buffer += data.decode(encoding, errors="replace")
|
|
151
|
+
|
|
152
|
+
blocks = text_buffer.split("\n\n")
|
|
153
|
+
text_buffer = blocks[-1]
|
|
154
|
+
for block in blocks[:-1]:
|
|
155
|
+
pub = parse_block(block)
|
|
156
|
+
if pub:
|
|
157
|
+
yield pub
|
|
158
|
+
|
|
159
|
+
data = decomp.flush()
|
|
160
|
+
if data:
|
|
161
|
+
text_buffer += data.decode(encoding, errors="replace")
|
|
162
|
+
|
|
163
|
+
if text_buffer:
|
|
164
|
+
blocks = text_buffer.split("\n\n")
|
|
165
|
+
for block in blocks:
|
|
166
|
+
pub = parse_block(block)
|
|
167
|
+
if pub:
|
|
168
|
+
yield pub
|
gismap/sources/hal.py
CHANGED
|
@@ -121,8 +121,8 @@ class HAL(DB):
|
|
|
121
121
|
>>> diego = publications[2].authors[3]
|
|
122
122
|
>>> diego
|
|
123
123
|
HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
|
|
124
|
-
>>> len(diego.get_publications())
|
|
125
|
-
|
|
124
|
+
>>> len(diego.get_publications()) > 28
|
|
125
|
+
True
|
|
126
126
|
>>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
|
|
127
127
|
HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
|
|
128
128
|
authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
|
gismap/sources/ldb.py
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
from platformdirs import user_data_dir
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
import errno
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
import zstandard as zstd
|
|
12
|
+
import dill as pickle
|
|
13
|
+
import numpy as np
|
|
14
|
+
import numba as nb
|
|
15
|
+
from bof.fuzz import Process
|
|
16
|
+
from gismo.common import safe_write
|
|
17
|
+
from tqdm.auto import tqdm
|
|
18
|
+
import requests
|
|
19
|
+
|
|
20
|
+
from gismap.sources.dblp_ttl import publis_streamer
|
|
21
|
+
from gismap.sources.models import DB, Author, Publication
|
|
22
|
+
from gismap.utils.logger import logger
|
|
23
|
+
from gismap.utils.text import asciify
|
|
24
|
+
from gismap.utils.zlist import ZList
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
DATA_DIR = Path(user_data_dir(
|
|
28
|
+
appname="gismap",
|
|
29
|
+
appauthor=False,
|
|
30
|
+
))
|
|
31
|
+
|
|
32
|
+
LDB_STEM = "ldb"
|
|
33
|
+
|
|
34
|
+
LDB_PATH = DATA_DIR / f"{LDB_STEM}.pkl.zst"
|
|
35
|
+
|
|
36
|
+
TTL_URL = "https://dblp.org/rdf/dblp.ttl.gz"
|
|
37
|
+
|
|
38
|
+
# GitHub release asset constants
|
|
39
|
+
GITHUB_REPO = "balouf/gismap"
|
|
40
|
+
GITHUB_API_URL = f"https://api.github.com/repos/{GITHUB_REPO}/releases"
|
|
41
|
+
LDB_ASSET_NAME = "ldb.pkl.zst"
|
|
42
|
+
LDB_META_PATH = DATA_DIR / "ldb_meta.json"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(repr=False)
|
|
46
|
+
class LDB(DB):
|
|
47
|
+
"""
|
|
48
|
+
Browse DBLP from a local copy of the database.
|
|
49
|
+
|
|
50
|
+
LDB is a class-only database - it should not be instantiated.
|
|
51
|
+
All methods are classmethods and state is stored in class variables.
|
|
52
|
+
"""
|
|
53
|
+
db_name: ClassVar[str] = LDB_STEM
|
|
54
|
+
source: ClassVar[str] = TTL_URL
|
|
55
|
+
|
|
56
|
+
# Class-level state (replaces instance attributes)
|
|
57
|
+
authors: ClassVar[ZList | None] = None
|
|
58
|
+
publis: ClassVar[ZList | None] = None
|
|
59
|
+
keys: ClassVar[dict | None] = None
|
|
60
|
+
search_engine: ClassVar[Process | None] = None
|
|
61
|
+
_initialized: ClassVar[bool] = False
|
|
62
|
+
|
|
63
|
+
__hash__ = object.__hash__
|
|
64
|
+
|
|
65
|
+
def __init__(self):
|
|
66
|
+
raise TypeError(
|
|
67
|
+
"LDB should not be instantiated. Use class methods directly, e.g., LDB.search_author(name)"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
@classmethod
|
|
71
|
+
def _ensure_loaded(cls):
|
|
72
|
+
"""Lazy-load the database if not already loaded."""
|
|
73
|
+
if cls._initialized:
|
|
74
|
+
return
|
|
75
|
+
if LDB_PATH.exists():
|
|
76
|
+
cls.load_db()
|
|
77
|
+
else:
|
|
78
|
+
logger.info("LDB not found locally. Attempting to retrieve from GitHub...")
|
|
79
|
+
try:
|
|
80
|
+
cls.retrieve()
|
|
81
|
+
cls.load_db()
|
|
82
|
+
except RuntimeError as e:
|
|
83
|
+
logger.warning(f"Could not auto-retrieve LDB: {e}")
|
|
84
|
+
|
|
85
|
+
@classmethod
|
|
86
|
+
def build_db(cls, source=None, limit=None, n_range=2, length_impact=.1, authors_frame=512, publis_frame=256):
|
|
87
|
+
if source is None:
|
|
88
|
+
source = cls.source
|
|
89
|
+
authors_dict = dict()
|
|
90
|
+
logger.info("Retrieve publications")
|
|
91
|
+
with ZList(frame_size=publis_frame) as publis:
|
|
92
|
+
for i, (key, title, typ, authors, url, streams, pages, venue, year) in enumerate(publis_streamer(source)):
|
|
93
|
+
auth_indices = []
|
|
94
|
+
for auth_key, auth_name in authors.items():
|
|
95
|
+
if auth_key not in authors_dict:
|
|
96
|
+
authors_dict[auth_key] = (len(authors_dict), auth_name, [i])
|
|
97
|
+
else:
|
|
98
|
+
authors_dict[auth_key][2].append(i)
|
|
99
|
+
auth_indices.append(authors_dict[auth_key][0])
|
|
100
|
+
publis.append((key, title, typ, auth_indices, url, streams, pages, venue, year))
|
|
101
|
+
if i == limit:
|
|
102
|
+
break
|
|
103
|
+
cls.publis = publis
|
|
104
|
+
logger.info(f"{len(publis)} publications retrieved.")
|
|
105
|
+
logger.info("Compact authors")
|
|
106
|
+
with ZList(frame_size=authors_frame) as authors:
|
|
107
|
+
for key, (_, name, pubs) in tqdm(authors_dict.items()):
|
|
108
|
+
authors.append((key, name, pubs))
|
|
109
|
+
cls.authors = authors
|
|
110
|
+
cls.keys = {k: v[0] for k, v in authors_dict.items()}
|
|
111
|
+
del authors_dict
|
|
112
|
+
cls.search_engine = Process(n_range=n_range, length_impact=length_impact)
|
|
113
|
+
cls.search_engine.fit([asciify(a[1]) for a in authors])
|
|
114
|
+
cls.search_engine.choices = np.arange(len(authors))
|
|
115
|
+
cls.search_engine.vectorizer.features_ = cls.numbify_dict(cls.search_engine.vectorizer.features_)
|
|
116
|
+
logger.info(f"{len(cls.authors)} compacted.")
|
|
117
|
+
cls._invalidate_cache()
|
|
118
|
+
cls._initialized = True
|
|
119
|
+
|
|
120
|
+
@classmethod
|
|
121
|
+
@lru_cache(maxsize=50000)
|
|
122
|
+
def author_by_index(cls, i):
|
|
123
|
+
key, name, _ = cls.authors[i]
|
|
124
|
+
return LDBAuthor(key=key, name=name)
|
|
125
|
+
|
|
126
|
+
@classmethod
|
|
127
|
+
def author_by_key(cls, key):
|
|
128
|
+
return cls.author_by_index(cls.keys[key])
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
@lru_cache(maxsize=50000)
|
|
132
|
+
def publication_by_index(cls, i):
|
|
133
|
+
key, title, typ, authors, url, streams, pages, venue, year = cls.publis[i]
|
|
134
|
+
if venue is None:
|
|
135
|
+
venue = "unpublished"
|
|
136
|
+
return {"key": key, "title": title, "type": typ,
|
|
137
|
+
"authors": authors,
|
|
138
|
+
"url": url, "streams": streams, "pages": pages,
|
|
139
|
+
"venue": venue, "year": year}
|
|
140
|
+
|
|
141
|
+
@classmethod
|
|
142
|
+
def author_publications(cls, key):
|
|
143
|
+
cls._ensure_loaded()
|
|
144
|
+
_, name, pubs = cls.authors[cls.keys[key]]
|
|
145
|
+
pubs = [cls.publication_by_index(k).copy() for k in pubs]
|
|
146
|
+
auth_ids = sorted({k for p in pubs for k in p["authors"]})
|
|
147
|
+
auths = {k: cls.author_by_index(k) for k in auth_ids}
|
|
148
|
+
for pub in pubs:
|
|
149
|
+
pub["authors"] = [auths[k] for k in pub["authors"]]
|
|
150
|
+
metadata = dict()
|
|
151
|
+
for k in ["url", "streams", "pages"]:
|
|
152
|
+
v = pub.pop(k)
|
|
153
|
+
if v is not None:
|
|
154
|
+
metadata[k] = v
|
|
155
|
+
pub["metadata"] = metadata
|
|
156
|
+
return [LDBPublication(**pub) for pub in pubs]
|
|
157
|
+
|
|
158
|
+
@classmethod
|
|
159
|
+
@lru_cache(maxsize=1000)
|
|
160
|
+
def search_author(cls, name, limit=2, score_cutoff=40.0, slack=10.0):
|
|
161
|
+
cls._ensure_loaded()
|
|
162
|
+
res = cls.search_engine.extract(asciify(name), limit=limit, score_cutoff=score_cutoff)
|
|
163
|
+
res = [r[0] for r in res if r[1] > res[0][1] - slack]
|
|
164
|
+
sorted_ids = {i: cls.author_by_index(i) for i in sorted(res)}
|
|
165
|
+
return [sorted_ids[i] for i in res]
|
|
166
|
+
|
|
167
|
+
@classmethod
|
|
168
|
+
def _invalidate_cache(cls):
|
|
169
|
+
cls.search_author.cache_clear()
|
|
170
|
+
cls.publication_by_index.cache_clear()
|
|
171
|
+
cls.author_by_index.cache_clear()
|
|
172
|
+
|
|
173
|
+
@classmethod
|
|
174
|
+
def from_author(cls, a):
|
|
175
|
+
return cls.author_publications(a.key)
|
|
176
|
+
|
|
177
|
+
@classmethod
|
|
178
|
+
def _get_release_info(cls, tag: str | None = None) -> dict:
|
|
179
|
+
"""
|
|
180
|
+
Fetch release metadata from GitHub API.
|
|
181
|
+
|
|
182
|
+
Parameters
|
|
183
|
+
----------
|
|
184
|
+
tag: :class:`str`, optional
|
|
185
|
+
Specific release tag (e.g., "v0.4.0"). If None, fetches latest.
|
|
186
|
+
|
|
187
|
+
Returns
|
|
188
|
+
-------
|
|
189
|
+
:class:`dict`
|
|
190
|
+
Release metadata including tag_name and assets.
|
|
191
|
+
|
|
192
|
+
Raises
|
|
193
|
+
------
|
|
194
|
+
:class:`RuntimeError`
|
|
195
|
+
If release not found or API request fails.
|
|
196
|
+
"""
|
|
197
|
+
if tag is None:
|
|
198
|
+
url = f"{GITHUB_API_URL}/latest"
|
|
199
|
+
else:
|
|
200
|
+
url = f"{GITHUB_API_URL}/tags/{tag}"
|
|
201
|
+
|
|
202
|
+
try:
|
|
203
|
+
response = requests.get(url, timeout=30)
|
|
204
|
+
response.raise_for_status()
|
|
205
|
+
return response.json()
|
|
206
|
+
except requests.exceptions.HTTPError as e:
|
|
207
|
+
if response.status_code == 404:
|
|
208
|
+
raise RuntimeError(f"Release not found: {tag or 'latest'}") from e
|
|
209
|
+
raise RuntimeError(f"GitHub API error: {e}") from e
|
|
210
|
+
except requests.exceptions.RequestException as e:
|
|
211
|
+
raise RuntimeError(f"Network error fetching release info: {e}") from e
|
|
212
|
+
|
|
213
|
+
@classmethod
|
|
214
|
+
def _download_file(cls, url: str, dest: Path, desc: str = "Downloading"):
|
|
215
|
+
"""
|
|
216
|
+
Download file with progress bar.
|
|
217
|
+
|
|
218
|
+
Parameters
|
|
219
|
+
----------
|
|
220
|
+
url : str
|
|
221
|
+
URL to download from.
|
|
222
|
+
dest : Path
|
|
223
|
+
Destination file path.
|
|
224
|
+
desc : str
|
|
225
|
+
Description for progress bar.
|
|
226
|
+
"""
|
|
227
|
+
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
228
|
+
|
|
229
|
+
response = requests.get(url, stream=True, timeout=30)
|
|
230
|
+
response.raise_for_status()
|
|
231
|
+
|
|
232
|
+
total_size = int(response.headers.get('content-length', 0))
|
|
233
|
+
|
|
234
|
+
with open(dest, 'wb') as f, tqdm(
|
|
235
|
+
desc=desc,
|
|
236
|
+
total=total_size,
|
|
237
|
+
unit='B',
|
|
238
|
+
unit_scale=True,
|
|
239
|
+
unit_divisor=1024,
|
|
240
|
+
) as pbar:
|
|
241
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
242
|
+
if chunk:
|
|
243
|
+
f.write(chunk)
|
|
244
|
+
pbar.update(len(chunk))
|
|
245
|
+
|
|
246
|
+
@classmethod
|
|
247
|
+
def _save_meta(cls, tag: str, url: str, size: int):
|
|
248
|
+
"""Save version metadata to JSON file."""
|
|
249
|
+
meta = {
|
|
250
|
+
"tag": tag,
|
|
251
|
+
"url": url,
|
|
252
|
+
"size": size,
|
|
253
|
+
"downloaded_at": datetime.now(timezone.utc).isoformat(),
|
|
254
|
+
}
|
|
255
|
+
LDB_META_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
256
|
+
with open(LDB_META_PATH, 'w') as f:
|
|
257
|
+
json.dump(meta, f, indent=2)
|
|
258
|
+
|
|
259
|
+
@classmethod
|
|
260
|
+
def _load_meta(cls) -> dict | None:
|
|
261
|
+
"""Load version metadata from JSON file."""
|
|
262
|
+
if not LDB_META_PATH.exists():
|
|
263
|
+
return None
|
|
264
|
+
try:
|
|
265
|
+
with open(LDB_META_PATH, 'r') as f:
|
|
266
|
+
return json.load(f)
|
|
267
|
+
except (json.JSONDecodeError, IOError):
|
|
268
|
+
return None
|
|
269
|
+
|
|
270
|
+
@classmethod
|
|
271
|
+
def retrieve(cls, version: str | None = None, force: bool = False):
|
|
272
|
+
"""
|
|
273
|
+
Download LDB database from GitHub releases.
|
|
274
|
+
|
|
275
|
+
Parameters
|
|
276
|
+
----------
|
|
277
|
+
version: :class:`str`, optional
|
|
278
|
+
Specific release version (e.g., "v0.4.0" or "0.4.0").
|
|
279
|
+
If None, downloads from latest release.
|
|
280
|
+
force: :class:`bool`, default=False
|
|
281
|
+
Download even if same version is installed.
|
|
282
|
+
|
|
283
|
+
Examples
|
|
284
|
+
--------
|
|
285
|
+
>> LDB.retrieve() # Latest release (freshest data)
|
|
286
|
+
>> LDB.retrieve("v0.4.0") # Specific version
|
|
287
|
+
>> LDB.retrieve("0.4.0") # Also works without 'v' prefix
|
|
288
|
+
|
|
289
|
+
Raises
|
|
290
|
+
------
|
|
291
|
+
RuntimeError
|
|
292
|
+
If release or asset not found, or download fails.
|
|
293
|
+
"""
|
|
294
|
+
# Normalize version string (add "v" prefix if missing)
|
|
295
|
+
tag = None
|
|
296
|
+
if version is not None:
|
|
297
|
+
tag = version if version.startswith("v") else f"v{version}"
|
|
298
|
+
|
|
299
|
+
# Fetch release info
|
|
300
|
+
logger.info(f"Fetching release info for: {tag or 'latest'}")
|
|
301
|
+
release_info = cls._get_release_info(tag)
|
|
302
|
+
release_tag = release_info["tag_name"]
|
|
303
|
+
|
|
304
|
+
# Check if already installed (unless force=True)
|
|
305
|
+
if not force:
|
|
306
|
+
meta = cls._load_meta()
|
|
307
|
+
if meta and meta.get("tag") == release_tag and LDB_PATH.exists():
|
|
308
|
+
logger.info(f"LDB version {release_tag} already installed. Use force=True to re-download.")
|
|
309
|
+
return
|
|
310
|
+
|
|
311
|
+
# Find ldb.pkl.zst asset in release
|
|
312
|
+
assets = release_info.get("assets", [])
|
|
313
|
+
ldb_asset = None
|
|
314
|
+
for asset in assets:
|
|
315
|
+
if asset["name"] == LDB_ASSET_NAME:
|
|
316
|
+
ldb_asset = asset
|
|
317
|
+
break
|
|
318
|
+
|
|
319
|
+
if ldb_asset is None:
|
|
320
|
+
raise RuntimeError(
|
|
321
|
+
f"Asset '{LDB_ASSET_NAME}' not found in release {release_tag}. "
|
|
322
|
+
f"Available assets: {[a['name'] for a in assets]}"
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
download_url = ldb_asset["browser_download_url"]
|
|
326
|
+
asset_size = ldb_asset["size"]
|
|
327
|
+
|
|
328
|
+
logger.info(f"Downloading LDB from release {release_tag} ({asset_size / 1e9:.2f} GB)")
|
|
329
|
+
|
|
330
|
+
# Download with progress bar
|
|
331
|
+
cls._download_file(download_url, LDB_PATH, desc=f"LDB {release_tag}")
|
|
332
|
+
|
|
333
|
+
# Save version metadata
|
|
334
|
+
cls._save_meta(release_tag, download_url, asset_size)
|
|
335
|
+
|
|
336
|
+
# Reset initialized flag so next access reloads
|
|
337
|
+
cls._initialized = False
|
|
338
|
+
cls._invalidate_cache()
|
|
339
|
+
|
|
340
|
+
logger.info(f"LDB {release_tag} successfully installed to {LDB_PATH}")
|
|
341
|
+
|
|
342
|
+
@classmethod
|
|
343
|
+
def db_info(cls) -> dict | None:
|
|
344
|
+
"""
|
|
345
|
+
Return installed version info.
|
|
346
|
+
|
|
347
|
+
Returns
|
|
348
|
+
-------
|
|
349
|
+
:class:`dict` or :class:`None`
|
|
350
|
+
Dictionary with tag, date, size, path; or None if not installed.
|
|
351
|
+
"""
|
|
352
|
+
meta = cls._load_meta()
|
|
353
|
+
if meta is None or not LDB_PATH.exists():
|
|
354
|
+
return None
|
|
355
|
+
|
|
356
|
+
return {
|
|
357
|
+
"tag": meta.get("tag"),
|
|
358
|
+
"downloaded_at": meta.get("downloaded_at"),
|
|
359
|
+
"size": meta.get("size"),
|
|
360
|
+
"path": str(LDB_PATH),
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
@classmethod
|
|
364
|
+
def check_update(cls) -> dict | None:
|
|
365
|
+
"""
|
|
366
|
+
Check if a newer version is available on GitHub.
|
|
367
|
+
|
|
368
|
+
Returns
|
|
369
|
+
-------
|
|
370
|
+
:class:`dict` or None
|
|
371
|
+
Dictionary with update info if available, None if up to date.
|
|
372
|
+
"""
|
|
373
|
+
try:
|
|
374
|
+
release_info = cls._get_release_info()
|
|
375
|
+
latest_tag = release_info["tag_name"]
|
|
376
|
+
|
|
377
|
+
meta = cls._load_meta()
|
|
378
|
+
current_tag = meta.get("tag") if meta else None
|
|
379
|
+
|
|
380
|
+
if current_tag == latest_tag:
|
|
381
|
+
logger.info(f"LDB is up to date: {current_tag}")
|
|
382
|
+
return None
|
|
383
|
+
|
|
384
|
+
return {
|
|
385
|
+
"current": current_tag,
|
|
386
|
+
"latest": latest_tag,
|
|
387
|
+
"message": f"Update available: {current_tag or 'not installed'} -> {latest_tag}",
|
|
388
|
+
}
|
|
389
|
+
except RuntimeError as e:
|
|
390
|
+
logger.warning(f"Could not check for updates: {e}")
|
|
391
|
+
return None
|
|
392
|
+
|
|
393
|
+
@classmethod
|
|
394
|
+
def dump(cls, filename: str, path=".", overwrite=False):
|
|
395
|
+
"""Save class state to file."""
|
|
396
|
+
# Convert numba dict to regular dict for pickling
|
|
397
|
+
nb_dict = None
|
|
398
|
+
if cls.search_engine is not None:
|
|
399
|
+
nb_dict = cls.search_engine.vectorizer.features_
|
|
400
|
+
cls.search_engine.vectorizer.features_ = dict(nb_dict)
|
|
401
|
+
|
|
402
|
+
state = {
|
|
403
|
+
'authors': cls.authors,
|
|
404
|
+
'publis': cls.publis,
|
|
405
|
+
'keys': cls.keys,
|
|
406
|
+
'search_engine': cls.search_engine,
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
# Use safe_write pattern from gismo.common
|
|
410
|
+
destination = Path(path) / f"{Path(filename).stem}.pkl.zst"
|
|
411
|
+
if destination.exists() and not overwrite:
|
|
412
|
+
print(f"File {destination} already exists! Use overwrite option to overwrite.")
|
|
413
|
+
else:
|
|
414
|
+
with safe_write(destination) as f:
|
|
415
|
+
cctx = zstd.ZstdCompressor(level=3)
|
|
416
|
+
with cctx.stream_writer(f) as z:
|
|
417
|
+
pickle.dump(state, z, protocol=5)
|
|
418
|
+
|
|
419
|
+
# Restore numba dict
|
|
420
|
+
if cls.search_engine is not None:
|
|
421
|
+
cls.search_engine.vectorizer.features_ = nb_dict
|
|
422
|
+
|
|
423
|
+
@classmethod
|
|
424
|
+
def load(cls, filename: str, path="."):
|
|
425
|
+
"""Load class state from file."""
|
|
426
|
+
dest = Path(path) / f"{Path(filename).stem}.pkl.zst"
|
|
427
|
+
if not dest.exists():
|
|
428
|
+
dest = dest.with_suffix(".pkl")
|
|
429
|
+
if not dest.exists():
|
|
430
|
+
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dest)
|
|
431
|
+
|
|
432
|
+
dctx = zstd.ZstdDecompressor()
|
|
433
|
+
with open(dest, "rb") as f, dctx.stream_reader(f) as z:
|
|
434
|
+
state = pickle.load(z)
|
|
435
|
+
|
|
436
|
+
cls.authors = state['authors']
|
|
437
|
+
cls.publis = state['publis']
|
|
438
|
+
cls.keys = state['keys']
|
|
439
|
+
cls.search_engine = state['search_engine']
|
|
440
|
+
|
|
441
|
+
if cls.search_engine is not None:
|
|
442
|
+
cls.search_engine.vectorizer.features_ = cls.numbify_dict(
|
|
443
|
+
cls.search_engine.vectorizer.features_
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
cls._invalidate_cache()
|
|
447
|
+
cls._initialized = True
|
|
448
|
+
|
|
449
|
+
@classmethod
def dump_db(cls):
    """Persist the current LDB state to the standard data directory."""
    # Make sure the target directory exists before writing the dump.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    cls.dump(LDB_STEM, path=DATA_DIR, overwrite=True)
|
|
453
|
+
|
|
454
|
+
@classmethod
def load_db(cls):
    """Restore the LDB state from the standard data directory, if any."""
    try:
        cls.load(LDB_STEM, path=DATA_DIR)
    except FileNotFoundError:
        # Best effort: a missing database is not fatal, just warn the user.
        logger.warning("No LDB installed. Build or retrieve before using.")
|
|
460
|
+
|
|
461
|
+
@staticmethod
def delete_db():
    """Remove the on-disk LDB dump if present (no-op otherwise)."""
    LDB_PATH.unlink(missing_ok=True)
|
|
465
|
+
|
|
466
|
+
@staticmethod
def numbify_dict(input_dict):
    """
    Convert a plain ``str -> int`` mapping into a numba typed dict.

    Parameters
    ----------
    input_dict: :class:`dict`
        Mapping with string keys and integer values.

    Returns
    -------
    :class:`numba.typed.Dict`
        Typed copy usable from jitted code.
    """
    typed = nb.typed.Dict.empty(
        key_type=nb.types.unicode_type, value_type=nb.types.int64
    )
    # Numba typed dicts cannot be built from a comprehension; copy key by key.
    for key, value in input_dict.items():
        typed[key] = value
    return typed
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
@dataclass(repr=False)
class LDBAuthor(Author, LDB):
    """
    Author backed by the local DBLP database (LDB).

    Attributes
    ----------
    key: :class:`str`
        DBLP persistent identifier (pid) of the author.
    aliases: :class:`list`
        Alternative names, if any.
    """
    key: str
    aliases: list = field(default_factory=list)

    @property
    def url(self):
        """Author page on dblp.org."""
        return f"https://dblp.org/pid/{self.key}.html"

    def get_publications(self):
        """Return the author's publications, looked up in the local database."""
        return LDB.from_author(self)
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
@dataclass(repr=False)
class LDBPublication(Publication, LDB):
    """
    Publication backed by the local DBLP database (LDB).

    Attributes
    ----------
    key: :class:`str`
        DBLP record key of the publication.
    metadata: :class:`dict`
        Extra record fields (e.g. ``url``, ``streams``).
    """
    key: str
    metadata: dict = field(default_factory=dict)

    @property
    def url(self):
        """Publication URL; falls back to the dblp.org record page."""
        return self.metadata.get("url", f"https://dblp.org/rec/{self.key}.html")

    @property
    def stream(self):
        """URL of the first associated dblp stream, or None if absent."""
        if "streams" not in self.metadata:
            return None
        return f'https://dblp.org/streams/{self.metadata["streams"][0]}'
|
gismap/sources/multi.py
CHANGED
|
@@ -15,7 +15,7 @@ def score_author_source(dbauthor):
|
|
|
15
15
|
return 2
|
|
16
16
|
else:
|
|
17
17
|
return 3
|
|
18
|
-
elif dbauthor.db_name
|
|
18
|
+
elif dbauthor.db_name in ["dblp", "ldb"]:
|
|
19
19
|
return 1
|
|
20
20
|
else:
|
|
21
21
|
return 0
|
|
@@ -69,7 +69,7 @@ class SourcedAuthor(Author):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
publication_score_rosetta = {
|
|
72
|
-
"db_name": {"dblp": 1, "hal": 2},
|
|
72
|
+
"db_name": {"dblp": 1, "ldb": 1, "hal": 2},
|
|
73
73
|
"venue": {"CoRR": -1, "unpublished": -2},
|
|
74
74
|
"type": {"conference": 1, "journal": 2},
|
|
75
75
|
}
|
gismap/utils/common.py
CHANGED
|
@@ -30,7 +30,7 @@ def unlist(x):
|
|
|
30
30
|
return x[0] if (isinstance(x, list) and x) else x
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
def get_classes(root, key="name"):
|
|
33
|
+
def get_classes(root, key="name", recurse=False):
|
|
34
34
|
"""
|
|
35
35
|
Parameters
|
|
36
36
|
----------
|
|
@@ -38,6 +38,8 @@ def get_classes(root, key="name"):
|
|
|
38
38
|
Starting class (can be abstract).
|
|
39
39
|
key: :class:`str`, default='name'
|
|
40
40
|
Attribute to look-up
|
|
41
|
+
recurse: bool, default=False
|
|
42
|
+
Recursively traverse subclasses.
|
|
41
43
|
|
|
42
44
|
Returns
|
|
43
45
|
-------
|
|
@@ -50,13 +52,16 @@ def get_classes(root, key="name"):
|
|
|
50
52
|
>>> from gismap.sources.models import DB
|
|
51
53
|
>>> subclasses = get_classes(DB, key='db_name')
|
|
52
54
|
>>> dict(sorted(subclasses.items())) # doctest: +NORMALIZE_WHITESPACE
|
|
53
|
-
{'dblp': <class 'gismap.sources.dblp.DBLP'>,
|
|
55
|
+
{'dblp': <class 'gismap.sources.dblp.DBLP'>,
|
|
56
|
+
'hal': <class 'gismap.sources.hal.HAL'>,
|
|
57
|
+
'ldb': <class 'gismap.sources.ldb.LDB'>}
|
|
54
58
|
"""
|
|
55
59
|
result = {
|
|
56
60
|
getattr(c, key): c for c in root.__subclasses__() if getattr(c, key, None)
|
|
57
61
|
}
|
|
58
|
-
|
|
59
|
-
|
|
62
|
+
if recurse:
|
|
63
|
+
for c in root.__subclasses__():
|
|
64
|
+
result.update(get_classes(c, key=key, recurse=True))
|
|
60
65
|
return result
|
|
61
66
|
|
|
62
67
|
|
|
@@ -83,20 +88,20 @@ def list_of_objects(clss, dico, default=None):
|
|
|
83
88
|
|
|
84
89
|
>>> from gismap.sources.models import DB
|
|
85
90
|
>>> subclasses = get_classes(DB, key='db_name')
|
|
86
|
-
>>> from gismap import HAL, DBLP
|
|
87
|
-
>>> list_of_objects([HAL, '
|
|
88
|
-
[<class 'gismap.sources.hal.HAL'>, <class 'gismap.sources.
|
|
91
|
+
>>> from gismap import HAL, DBLP, LDB
|
|
92
|
+
>>> list_of_objects([HAL, 'ldb'], subclasses)
|
|
93
|
+
[<class 'gismap.sources.hal.HAL'>, <class 'gismap.sources.ldb.LDB'>]
|
|
89
94
|
>>> list_of_objects(None, subclasses, [DBLP])
|
|
90
95
|
[<class 'gismap.sources.dblp.DBLP'>]
|
|
91
|
-
>>> list_of_objects(
|
|
92
|
-
[<class 'gismap.sources.
|
|
96
|
+
>>> list_of_objects(LDB, subclasses)
|
|
97
|
+
[<class 'gismap.sources.ldb.LDB'>]
|
|
93
98
|
>>> list_of_objects('hal', subclasses)
|
|
94
99
|
[<class 'gismap.sources.hal.HAL'>]
|
|
95
100
|
"""
|
|
96
101
|
if default is None:
|
|
97
102
|
default = []
|
|
98
103
|
if clss is None:
|
|
99
|
-
return default
|
|
104
|
+
return list_of_objects(clss=default, dico=dico)
|
|
100
105
|
elif isinstance(clss, str):
|
|
101
106
|
return [dico[clss]]
|
|
102
107
|
elif isinstance(clss, list):
|
gismap/utils/logger.py
CHANGED
gismap/utils/requests.py
CHANGED
|
@@ -21,7 +21,9 @@ def get(url, params=None, n_trials=10, verify=True):
|
|
|
21
21
|
Entry point to fetch.
|
|
22
22
|
params: :class:`dict`, optional
|
|
23
23
|
Get arguments (appended to URL).
|
|
24
|
-
|
|
24
|
+
n_trials: :class:`int`, default=10
|
|
25
|
+
Number of attempts to fetch URL.
|
|
26
|
+
verify: :class:`bool`, default=True
|
|
25
27
|
Verify certificates.
|
|
26
28
|
|
|
27
29
|
Returns
|
gismap/utils/zlist.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from gismo.common import MixInIO
|
|
2
|
+
import zstandard as zstd
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pickle
|
|
5
|
+
|
|
6
|
+
dctx = zstd.ZstdDecompressor()
cctx = zstd.ZstdCompressor()


class ZList(MixInIO):
    """
    List compressed by frames of elements. Allows to store compressed data in
    memory with decent seek and scan.

    Populate inside a ``with`` block (building mode), then read by index.
    Supporting ``__getitem__`` + ``__len__`` makes the object iterable.

    Parameters
    ----------
    frame_size: :class:`int`, default=1000
        Size of each frame in number of elements.
    """
    def __init__(self, frame_size=1000):
        self.frame_size = frame_size
        self.frame = None  # currently decompressed frame (list of elements)
        self._frame_index = None  # index of the frame held in `frame`
        self._blob = None  # concatenated compressed frames
        self._off = None  # byte offsets of frame boundaries within `_blob`
        self._n = None  # total number of elements appended
        self._batch = None  # pending, not-yet-compressed elements

    def _merge_batch(self):
        """Compress pending elements into one frame and append it to the blob."""
        if self._batch:
            frame = cctx.compress(pickle.dumps(self._batch))
            self._blob += frame
            self._off.append(len(self._blob))
            self._batch = []

    def append(self, entry):
        """Add one element. Only valid inside the ``with`` (building) block."""
        self._batch.append(entry)
        self._n += 1
        if len(self._batch) == self.frame_size:
            self._merge_batch()

    @property
    def size(self):
        """Compressed size of the blob, in bytes."""
        return len(self._blob)

    def __enter__(self):
        # Enter building mode: reset all internal structures.
        self._blob = bytearray()
        self._off = [0]
        self._n = 0
        self._batch = []
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Flush the last (possibly partial) frame and freeze the structures.
        self._merge_batch()
        self._blob = bytes(self._blob)
        self._off = np.array(self._off, dtype=int)

    def load_frame(self, f):
        """Decompress frame number `f` into ``self.frame``."""
        self.frame = pickle.loads(
            dctx.decompress(self._blob[self._off[f]:self._off[f + 1]])
        )

    def __getitem__(self, i):
        # Normalize negative indices and bound-check explicitly: the previous
        # arithmetic sliced `_blob[_off[-1]:_off[0]]` (empty) for i < 0 and
        # failed with incidental errors past the end. A proper IndexError also
        # lets plain iteration (`for x in zlist`) terminate correctly.
        if i < 0:
            i += self._n
        if not 0 <= i < self._n:
            raise IndexError("ZList index out of range")
        g, f = divmod(i, self.frame_size)
        if g != self._frame_index:
            # Cache the decompressed frame; sequential scans reuse it.
            self.load_frame(g)
            self._frame_index = g
        return self.frame[f]

    def __len__(self):
        return self._n
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gismap
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: GisMap leverages DBLP and HAL databases to provide cartography tools for you and your lab.
|
|
5
5
|
Project-URL: Repository, https://github.com/balouf/gismap
|
|
6
6
|
Project-URL: Documentation, https://balouf.github.io/gismap
|
|
@@ -9,12 +9,14 @@ Maintainer-email: Fabien Mathieu <fabien.mathieu@normalesup.org>
|
|
|
9
9
|
License-Expression: MIT
|
|
10
10
|
License-File: AUTHORS.md
|
|
11
11
|
Requires-Python: >=3.10
|
|
12
|
+
Requires-Dist: beautifulsoup4>=4.14.2
|
|
12
13
|
Requires-Dist: bof>=0.3.5
|
|
13
14
|
Requires-Dist: distinctipy>=1.3.4
|
|
14
15
|
Requires-Dist: domonic>=0.9.13
|
|
15
16
|
Requires-Dist: gismo>=0.5.2
|
|
16
17
|
Requires-Dist: ipykernel>=6.30.1
|
|
17
18
|
Requires-Dist: ipywidgets>=8.1.8
|
|
19
|
+
Requires-Dist: platformdirs>=4.5.0
|
|
18
20
|
Requires-Dist: tqdm>=4.67.1
|
|
19
21
|
Description-Content-Type: text/markdown
|
|
20
22
|
|
|
@@ -61,17 +63,28 @@ Install GisMap:
|
|
|
61
63
|
$ pip install gismap
|
|
62
64
|
```
|
|
63
65
|
|
|
64
|
-
Use GisMap to
|
|
66
|
+
Use GisMap to display a collaboration graph (HTML) from a Notebook:
|
|
65
67
|
|
|
66
68
|
```pycon
|
|
67
|
-
>>> from gismap.
|
|
68
|
-
>>>
|
|
69
|
-
>>> lab = ListLab(["Fabien Mathieu", "François Baccelli", "Ludovic Noirie", "Céline Comte", "Sébastien Tixeuil"], dbs="hal")
|
|
69
|
+
>>> from gismap.lab import ListMap
|
|
70
|
+
>>> lab = ListMap(["Fabien Mathieu", "François Baccelli", "Ludovic Noirie", "Céline Comte", "Sébastien Tixeuil"], dbs="hal")
|
|
70
71
|
>>> lab.update_authors()
|
|
71
72
|
>>> lab.update_publis()
|
|
72
73
|
>>> lab.show_html()
|
|
73
74
|
```
|
|
74
75
|
|
|
76
|
+
If you are not using Jupyter Lab/Notebook, rich display will not work.
|
|
77
|
+
Instead, save the HTML and open it in your browser:
|
|
78
|
+
|
|
79
|
+
```pycon
|
|
80
|
+
>>> from gismap.lab import ListMap
|
|
81
|
+
>>> lab = ListMap(["Fabien Mathieu", "François Baccelli", "Ludovic Noirie", "Céline Comte", "Sébastien Tixeuil"], dbs="hal")
|
|
82
|
+
>>> lab.update_authors()
|
|
83
|
+
>>> lab.update_publis()
|
|
84
|
+
>>> lab.save_html("my_graph")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
|
|
75
88
|
## Credits
|
|
76
89
|
|
|
77
90
|
This package was created with [Cookiecutter][CC] and the [Package Helper 3][PH3] project template.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
gismap/__init__.py,sha256=
|
|
1
|
+
gismap/__init__.py,sha256=FHZLy3T2zFVrFRe3sqSRTuXkZ6DYise9HtXCWWfXrys,866
|
|
2
2
|
gismap/author.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
gismap/build.py,sha256=1oNs3qjm2DNkOP19iVDPderF2Sx3w5qJH4SgirDHKcU,103
|
|
3
4
|
gismap/gismap.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
|
|
4
5
|
gismap/gismo.py,sha256=oDAryl4XQzHE0tUmOWC-3G1n_zUgTeykPL-JWSDYwe0,6307
|
|
5
6
|
gismap/search.py,sha256=nsUoDsFGeEtvCZ0dB7ooRPC_6qsazkiWx_oM7dHdNV4,4932
|
|
@@ -10,29 +11,33 @@ gismap/gisgraphs/groups.py,sha256=1E-7Xrv0uDw2SgqwtdjgeRLVBLaC7agUrrVics4jVLs,24
|
|
|
10
11
|
gismap/gisgraphs/js.py,sha256=Gbz5nMWORabZkgIdyZAe1sMlnwJZ9jy7sLrx0vYStzI,6283
|
|
11
12
|
gismap/gisgraphs/options.py,sha256=lmUSnfSwrZQyJpGGs16JUGDIQNcJeX4Y0tA8cyC0nuM,817
|
|
12
13
|
gismap/gisgraphs/style.py,sha256=sXNUnv690kxiJiRQZ7lv4iKKrsxMqAfblheJbqesd48,4653
|
|
13
|
-
gismap/gisgraphs/widget.py,sha256=
|
|
14
|
+
gismap/gisgraphs/widget.py,sha256=ccTgmfs1-23aVFnOv09aKMf07pfsEsgeLdcywVELzL8,4537
|
|
14
15
|
gismap/lab/__init__.py,sha256=ifyZqI9BpC5NRlMfSmJ671tnKWJDoXbo18iDoE-VR1s,181
|
|
15
16
|
gismap/lab/egomap.py,sha256=RabRJSWJ0xrG67l012En0rbi7ukr4R2lR0hc_K7Xp0o,1211
|
|
16
17
|
gismap/lab/expansion.py,sha256=CMUsXqo-shRyb_MiuPRL5-ZgaitxAxjfbSY_fvzi_1E,6236
|
|
17
18
|
gismap/lab/filters.py,sha256=pG_g2POQXMbyUUw0aXOaeyiGBbiSc7M2NzxLCTQrALk,1875
|
|
18
|
-
gismap/lab/lab_author.py,sha256=
|
|
19
|
-
gismap/lab/labmap.py,sha256=
|
|
19
|
+
gismap/lab/lab_author.py,sha256=tiv6Z2RUrmfba0zYNS83cPTwN2YyGj7_bcqN2Ak_JXk,4420
|
|
20
|
+
gismap/lab/labmap.py,sha256=jDXFIxe0Jk89wUaweodPxN2thxMgi-hgnqSavhaapZc,5748
|
|
20
21
|
gismap/lab_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
22
|
gismap/lab_examples/cedric.py,sha256=AjgYy5dhzqh3vDsr9ia_hbtSc9_2Ic238rmJO198FMM,1764
|
|
23
|
+
gismap/lab_examples/lamsade.py,sha256=m5uDT9IGpBT1ARknKl44WmFv5b_tLWfvtOjgOThp5fA,1294
|
|
22
24
|
gismap/lab_examples/lincs.py,sha256=-mIVMGQMrtCtJ3N-oCU8j4Ko9mDuhEPB_pA0gaIw4QA,1126
|
|
23
25
|
gismap/lab_examples/lip6.py,sha256=K32Jqe3-o99QYI--akmwBDFAWKgq0HFEk_psC4akR60,1740
|
|
24
26
|
gismap/lab_examples/toulouse.py,sha256=OUKrK0uefn4uvW74qMsF792un203z3OUfKTquLPGBH4,2091
|
|
25
27
|
gismap/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
gismap/sources/dblp.py,sha256=
|
|
27
|
-
gismap/sources/
|
|
28
|
+
gismap/sources/dblp.py,sha256=FXVsRhrPc0iqsd_a9cMzUYB5YdMxOC4ho3Ip4lCyjtE,4834
|
|
29
|
+
gismap/sources/dblp_ttl.py,sha256=JI_1C7yv1T8TfXMfLNPSFBbCoghYMYoDY7s6K_2arUs,5456
|
|
30
|
+
gismap/sources/hal.py,sha256=VOd7mEUeM0wcfetHYYsX5n4jXNVYQKP12G-iNQsa0XE,10313
|
|
31
|
+
gismap/sources/ldb.py,sha256=KEHREkne7hUy-04VKJOlvzkJQhvKKZJADcvhEBLCgfY,16766
|
|
28
32
|
gismap/sources/models.py,sha256=XlNrQWTF-DQbfIFaSLPsgWPN-c79_0rfr_2jDasgukM,713
|
|
29
|
-
gismap/sources/multi.py,sha256=
|
|
33
|
+
gismap/sources/multi.py,sha256=QlVtuQasznXSXSmJryWFWb2ZmaOOJFoEpgn2Js-IGcc,4709
|
|
30
34
|
gismap/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
-
gismap/utils/common.py,sha256=
|
|
32
|
-
gismap/utils/logger.py,sha256=
|
|
33
|
-
gismap/utils/requests.py,sha256=
|
|
35
|
+
gismap/utils/common.py,sha256=6JhdB_EJnaXwnBGiJutPx5vFEr4wYEvsqKcivVDbGMk,3115
|
|
36
|
+
gismap/utils/logger.py,sha256=zvOPJqC7V6GV4Ov8M9-tnK63c2poDAEcWq_UarOLcpg,117
|
|
37
|
+
gismap/utils/requests.py,sha256=ZSKYJ08MlEtJTHdKYi61KxK6RjYxTBNxWjEUH-EtbbI,1468
|
|
34
38
|
gismap/utils/text.py,sha256=1_9DlduAYh7Nz-yAg-MaCTmdKbPPmuIY20bb87t7JAQ,3810
|
|
35
|
-
gismap
|
|
36
|
-
gismap-0.
|
|
37
|
-
gismap-0.
|
|
38
|
-
gismap-0.
|
|
39
|
+
gismap/utils/zlist.py,sha256=F66rilTalbRgqiJaPIxDJxKs_2KFOp2ZEH8Ef_CRxYA,1810
|
|
40
|
+
gismap-0.4.0.dist-info/METADATA,sha256=wrptTFqdKckSixC0KEzzQ8pVH5aes2vDfz-5NKFrS-A,3903
|
|
41
|
+
gismap-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
42
|
+
gismap-0.4.0.dist-info/licenses/AUTHORS.md,sha256=oDR4mptVUBMq0WKIpt19Km1Bdfz3cO2NAOVgwVfTO8g,131
|
|
43
|
+
gismap-0.4.0.dist-info/RECORD,,
|
|
File without changes
|