gismap 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gismap/__init__.py +1 -0
- gismap/build.py +26 -0
- gismap/gisgraphs/builder.py +23 -8
- gismap/gisgraphs/graph.py +15 -0
- gismap/gisgraphs/widget.py +23 -3
- gismap/lab/egomap.py +29 -7
- gismap/lab/expansion.py +35 -1
- gismap/lab/lab_author.py +34 -7
- gismap/lab/labmap.py +46 -6
- gismap/lab_examples/cedric.py +19 -6
- gismap/lab_examples/lamsade.py +45 -0
- gismap/lab_examples/toulouse.py +6 -2
- gismap/search.py +61 -1
- gismap/sources/dblp.py +23 -1
- gismap/sources/dblp_ttl.py +187 -0
- gismap/sources/hal.py +40 -2
- gismap/sources/ldb.py +716 -0
- gismap/sources/models.py +83 -0
- gismap/sources/multi.py +67 -2
- gismap/utils/common.py +73 -11
- gismap/utils/logger.py +2 -0
- gismap/utils/requests.py +3 -1
- gismap/utils/text.py +1 -1
- gismap/utils/zlist.py +88 -0
- {gismap-0.3.0.dist-info → gismap-0.4.1.dist-info}/METADATA +26 -11
- gismap-0.4.1.dist-info/RECORD +43 -0
- {gismap-0.3.0.dist-info → gismap-0.4.1.dist-info}/WHEEL +1 -1
- gismap-0.3.0.dist-info/RECORD +0 -38
- {gismap-0.3.0.dist-info → gismap-0.4.1.dist-info}/licenses/AUTHORS.md +0 -0
gismap/sources/dblp.py
CHANGED
|
@@ -13,7 +13,7 @@ from gismap.utils.requests import get
|
|
|
13
13
|
class DBLP(DB):
|
|
14
14
|
db_name: ClassVar[str] = "dblp"
|
|
15
15
|
author_backoff: ClassVar[float] = 5.0
|
|
16
|
-
publi_backoff: ClassVar[float] =
|
|
16
|
+
publi_backoff: ClassVar[float] = 5.0
|
|
17
17
|
|
|
18
18
|
@classmethod
|
|
19
19
|
def search_author(cls, name, wait=True):
|
|
@@ -98,6 +98,7 @@ class DBLPAuthor(Author, DBLP):
|
|
|
98
98
|
key='conf/sss/Mathieu07')
|
|
99
99
|
|
|
100
100
|
"""
|
|
101
|
+
|
|
101
102
|
key: str
|
|
102
103
|
aliases: list = field(default_factory=list)
|
|
103
104
|
|
|
@@ -124,6 +125,27 @@ DBLP_TYPES = {
|
|
|
124
125
|
|
|
125
126
|
@dataclass(repr=False)
|
|
126
127
|
class DBLPPublication(Publication, DBLP):
|
|
128
|
+
"""
|
|
129
|
+
Publication from the DBLP database.
|
|
130
|
+
|
|
131
|
+
Parameters
|
|
132
|
+
----------
|
|
133
|
+
title: :class:`str`
|
|
134
|
+
Publication title.
|
|
135
|
+
authors: :class:`list`
|
|
136
|
+
List of :class:`DBLPAuthor` objects.
|
|
137
|
+
venue: :class:`str`
|
|
138
|
+
Publication venue.
|
|
139
|
+
type: :class:`str`
|
|
140
|
+
Publication type.
|
|
141
|
+
year: :class:`int`
|
|
142
|
+
Publication year.
|
|
143
|
+
key: :class:`str`
|
|
144
|
+
DBLP record key.
|
|
145
|
+
metadata: :class:`dict`
|
|
146
|
+
Additional metadata (pages, volume, etc.).
|
|
147
|
+
"""
|
|
148
|
+
|
|
127
149
|
key: str
|
|
128
150
|
metadata: dict = field(default_factory=dict)
|
|
129
151
|
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import zlib
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
from gismap.utils.requests import session
|
|
9
|
+
from gismap.sources.dblp import DBLP_TYPES
|
|
10
|
+
|
|
11
|
+
# Building blocks of the publication pattern. Every piece after the key starts
# with a lazy ``.*?`` so the pieces can simply be concatenated and matched in
# the order the predicates are expected to appear inside one block.
# Dots in URL literals are escaped so that they only match a literal ".".
key_re = r"<https://dblp\.org/rec/([^>]+)>"
title_re = r'.*?dblp:title\s+"([^"]+)"'
type_re = r".*?dblp:bibtexType\s+bibtex:(\w+)"
# Greedy on purpose: captures every bracketed signature up to the last "] ;".
authors_re = r".*?dblp:hasSignature\s+(\[.*\])\s*;"
# The four pieces below are optional: their group is None when absent.
url_re = r"(?:.*?dblp:primaryDocumentPage <([^>]+)>)?"
stream_re = r"(?:.*?dblp:publishedInStream ([^;]+) ;)?"
pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'
year_re = r'.*?"(\d{4})"\^\^<http://www\.w3\.org/2001/XMLSchema#gYear>'

# Full publication pattern; nine capture groups in the order listed below.
# re.S lets "." span newlines because one publication covers several lines.
pub_re = re.compile(
    "".join(
        [
            key_re,
            title_re,
            type_re,
            authors_re,
            url_re,
            stream_re,
            pages_re,
            venue_re,
            year_re,
        ]
    ),
    flags=re.S,
)

# Extracts the normalized stream identifiers (conf/... or journals/...).
streams_re = re.compile(r"<https://dblp\.org/streams/((?:conf|journals)/[^>]+)>")

# Extracts (name, pid) from each signature; an optional trailing numeric
# suffix in the name (e.g. "Jane Doe 0001") is left out of the captured name.
authid_re = re.compile(
    r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp\.org/pid/([^>]+?)>.*?]',
    flags=re.S,
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def parse_block(dblp_block):
    """
    Extract the fields of one publication from a DBLP turtle block.

    Parameters
    ----------
    dblp_block: :class:`str`
        A DBLP publication, turtle format.

    Returns
    -------
    :class:`tuple` or :class:`NoneType`
        ``None`` when the block does not match the publication pattern or when
        no author could be extracted. Otherwise a 9-tuple made of:

        key: :class:`str`
            DBLP key.
        title: :class:`str`
            Publication title.
        type: :class:`str`
            Type of publication (lowercased, then mapped through ``DBLP_TYPES``
            when an entry exists).
        authors: :class:`dict`
            Publication authors (key -> name).
        url: :class:`str` or :class:`NoneType`
            Publication URL.
        stream: :class:`list` or :class:`NoneType`
            Publication streams (normalized journal/conf).
        pages: :class:`str` or :class:`NoneType`
            Publication pages.
        venue: :class:`str` or :class:`NoneType`
            Publication venue (conf/journal).
        year: :class:`int`
            Year of publication.
    """
    match = pub_re.search(dblp_block)
    if match is None:
        return None
    key, title, raw_type, signatures, url, stream, pages, venue, year = match.groups()

    # authid_re captures (name, pid); the mapping is indexed by pid.
    authors = {pid: name for name, pid in authid_re.findall(signatures)}
    if not authors:
        return None

    kind = raw_type.lower()
    kind = DBLP_TYPES.get(kind, kind)
    # The stream group is optional: keep it untouched when it is absent.
    streams = streams_re.findall(stream) if stream else stream
    return key, title, kind, authors, url, streams, pages, venue, int(year)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@contextmanager
def get_stream(source, chunk_size=1024 * 64):
    """
    Open a source as a stream of raw byte chunks.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the content is. Can be a local file or a ``http://`` / ``https://`` URL.
    chunk_size: :class:`int`, optional
        Desired chunk size. Originally documented as having to be a multiple
        of 32kB for streaming gz content; nothing in this function enforces it
        (TODO confirm whether the constraint is still needed).

    Yields
    ------
    iterable
        Chunk iterator that streams the content. Empty if `source` is a local
        path that does not exist.
    :class:`int` or :class:`NoneType`
        Source size in bytes (used later to compute ETA). ``None`` for a URL
        whose response has no usable ``content-length`` header, 0 for a
        missing local file.
    """
    if isinstance(source, str) and source.startswith(("http://", "https://")):
        # Remote content: the response stays open while the caller iterates.
        with session.get(source, stream=True) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0)) or None
            yield r.iter_content(chunk_size=chunk_size), total
        return

    source = Path(source)
    if not source.exists():
        # Best effort: a missing file is exposed as an empty stream.
        yield [], 0
        return

    total = source.stat().st_size
    with source.open("rb") as file_handle:
        # Calls read(chunk_size) until it returns b"" (end of file).
        yield iter(lambda: file_handle.read(chunk_size), b""), total
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
    """
    Stream the publications found in a gzipped DBLP turtle source.

    The source is read chunk by chunk, decompressed and decoded on the fly,
    split into blank-line separated blocks, and each block is handed to
    :func:`parse_block`. Blocks that cannot be parsed are skipped.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the DBLP turtle content is. Can be on a local file or on the Internet.
    chunk_size: :class:`int`, optional
        Desired chunk size. Originally documented as having to be a multiple
        of 32kB (TODO confirm; nothing here enforces it).
    encoding: :class:`str`, default=unicode_escape
        Encoding of stream.

    Yields
    ------
    :class:`tuple`
        The 9-tuple returned by :func:`parse_block`:
        ``(key, title, type, authors, url, stream, pages, venue, year)``.
    """
    # Function-scope import so the module's import block is left untouched.
    import codecs

    # Incremental decoding keeps an escape sequence that is split across two
    # chunks (e.g. "\\u00" | "e9") intact instead of decoding each half alone.
    decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    text_buffer = ""

    with get_stream(source, chunk_size=chunk_size) as (stream, total):
        with tqdm(
            total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing"
        ) as pbar:
            for chunk in stream:
                if not chunk:
                    continue

                # Progress is measured in compressed bytes, like `total`.
                pbar.update(len(chunk))
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text_buffer += decoder.decode(data)

                # The last piece may be an incomplete block: keep it for later.
                *blocks, text_buffer = text_buffer.split("\n\n")
                for block in blocks:
                    pub = parse_block(block)
                    if pub:
                        yield pub

            # Drain what the decompressor and the decoder still hold.
            text_buffer += decoder.decode(decomp.flush(), final=True)

            if text_buffer:
                for block in text_buffer.split("\n\n"):
                    pub = parse_block(block)
                    if pub:
                        yield pub
|
gismap/sources/hal.py
CHANGED
|
@@ -121,8 +121,8 @@ class HAL(DB):
|
|
|
121
121
|
>>> diego = publications[2].authors[3]
|
|
122
122
|
>>> diego
|
|
123
123
|
HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
|
|
124
|
-
>>> len(diego.get_publications())
|
|
125
|
-
|
|
124
|
+
>>> len(diego.get_publications()) > 28
|
|
125
|
+
True
|
|
126
126
|
>>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
|
|
127
127
|
HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
|
|
128
128
|
authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
|
|
@@ -180,6 +180,23 @@ class HAL(DB):
|
|
|
180
180
|
|
|
181
181
|
@dataclass(repr=False)
|
|
182
182
|
class HALAuthor(Author, HAL):
|
|
183
|
+
"""
|
|
184
|
+
Author from the HAL (Hyper Articles en Ligne) database.
|
|
185
|
+
|
|
186
|
+
HAL is a French open archive for scholarly publications.
|
|
187
|
+
|
|
188
|
+
Parameters
|
|
189
|
+
----------
|
|
190
|
+
name: :class:`str`
|
|
191
|
+
The author's name.
|
|
192
|
+
key: :class:`str` or :class:`int`, optional
|
|
193
|
+
HAL identifier for the author.
|
|
194
|
+
key_type: :class:`str`, optional
|
|
195
|
+
Type of key ('pid', 'fullname', or None for idHal).
|
|
196
|
+
aliases: :class:`list`
|
|
197
|
+
Alternative names for the author.
|
|
198
|
+
"""
|
|
199
|
+
|
|
183
200
|
key: str | int = None
|
|
184
201
|
key_type: str = None
|
|
185
202
|
aliases: list = field(default_factory=list)
|
|
@@ -277,6 +294,27 @@ HAL_KEYS = {
|
|
|
277
294
|
|
|
278
295
|
@dataclass(repr=False)
|
|
279
296
|
class HALPublication(Publication, HAL):
|
|
297
|
+
"""
|
|
298
|
+
Publication from the HAL database.
|
|
299
|
+
|
|
300
|
+
Parameters
|
|
301
|
+
----------
|
|
302
|
+
title: :class:`str`
|
|
303
|
+
Publication title.
|
|
304
|
+
authors: :class:`list`
|
|
305
|
+
List of :class:`HALAuthor` objects.
|
|
306
|
+
venue: :class:`str`
|
|
307
|
+
Publication venue.
|
|
308
|
+
type: :class:`str`
|
|
309
|
+
Publication type.
|
|
310
|
+
year: :class:`int`
|
|
311
|
+
Publication year.
|
|
312
|
+
key: :class:`str`
|
|
313
|
+
HAL document identifier.
|
|
314
|
+
metadata: :class:`dict`
|
|
315
|
+
Additional metadata (abstract, URL, etc.).
|
|
316
|
+
"""
|
|
317
|
+
|
|
280
318
|
key: str
|
|
281
319
|
metadata: dict = field(default_factory=dict)
|
|
282
320
|
|