gismap 0.2.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gismap/__init__.py +2 -0
- gismap/build.py +4 -0
- gismap/gisgraphs/__init__.py +0 -0
- gismap/gisgraphs/builder.py +105 -0
- gismap/{lab → gisgraphs}/graph.py +70 -66
- gismap/gisgraphs/groups.py +70 -0
- gismap/gisgraphs/js.py +190 -0
- gismap/gisgraphs/options.py +37 -0
- gismap/gisgraphs/style.py +119 -0
- gismap/gisgraphs/widget.py +145 -0
- gismap/lab/__init__.py +0 -4
- gismap/lab/egomap.py +6 -7
- gismap/lab/expansion.py +7 -6
- gismap/lab/filters.py +1 -1
- gismap/lab/lab_author.py +50 -6
- gismap/lab/labmap.py +7 -6
- gismap/lab_examples/__init__.py +0 -0
- gismap/lab_examples/cedric.py +46 -0
- gismap/lab_examples/lamsade.py +43 -0
- gismap/{lab → lab_examples}/lincs.py +2 -2
- gismap/{lab → lab_examples}/toulouse.py +20 -3
- gismap/sources/dblp.py +16 -18
- gismap/sources/dblp_ttl.py +168 -0
- gismap/sources/hal.py +19 -10
- gismap/sources/ldb.py +501 -0
- gismap/sources/models.py +7 -0
- gismap/sources/multi.py +25 -17
- gismap/utils/common.py +15 -10
- gismap/utils/logger.py +2 -0
- gismap/utils/requests.py +6 -2
- gismap/utils/zlist.py +68 -0
- {gismap-0.2.2.dist-info → gismap-0.4.0.dist-info}/METADATA +37 -8
- gismap-0.4.0.dist-info/RECORD +43 -0
- {gismap-0.2.2.dist-info → gismap-0.4.0.dist-info}/WHEEL +1 -1
- gismap/lab/vis.py +0 -329
- gismap-0.2.2.dist-info/RECORD +0 -30
- /gismap/{lab → lab_examples}/lip6.py +0 -0
- {gismap-0.2.2.dist-info → gismap-0.4.0.dist-info}/licenses/AUTHORS.md +0 -0
gismap/sources/dblp.py
CHANGED
|
@@ -13,7 +13,7 @@ from gismap.utils.requests import get
|
|
|
13
13
|
class DBLP(DB):
|
|
14
14
|
db_name: ClassVar[str] = "dblp"
|
|
15
15
|
author_backoff: ClassVar[float] = 5.0
|
|
16
|
-
publi_backoff: ClassVar[float] =
|
|
16
|
+
publi_backoff: ClassVar[float] = 5.0
|
|
17
17
|
|
|
18
18
|
@classmethod
|
|
19
19
|
def search_author(cls, name, wait=True):
|
|
@@ -72,23 +72,6 @@ class DBLP(DB):
|
|
|
72
72
|
Papers available in DBLP.
|
|
73
73
|
wait: :class:`bool`
|
|
74
74
|
Wait a bit to avoid 429.
|
|
75
|
-
|
|
76
|
-
Examples
|
|
77
|
-
--------
|
|
78
|
-
|
|
79
|
-
>>> fabien = DBLPAuthor('Fabien Mathieu', key='66/2077')
|
|
80
|
-
>>> publications = sorted(DBLP.from_author(fabien),
|
|
81
|
-
... key=lambda p: p.title)
|
|
82
|
-
>>> publications[0] # doctest: +NORMALIZE_WHITESPACE
|
|
83
|
-
DBLPPublication(title='Achievable catalog size in peer-to-peer video-on-demand systems.',
|
|
84
|
-
authors=[DBLPAuthor(name='Yacine Boufkhad', key='75/5742'), DBLPAuthor(name='Fabien Mathieu', key='66/2077'),
|
|
85
|
-
DBLPAuthor(name='Fabien de Montgolfier', key='57/6313'), DBLPAuthor(name='Diego Perino', key='03/3645'),
|
|
86
|
-
DBLPAuthor(name='Laurent Viennot', key='v/LaurentViennot')],
|
|
87
|
-
venue='IPTPS', type='conference', year=2008, key='conf/iptps/BoufkhadMMPV08')
|
|
88
|
-
>>> publications[-1] # doctest: +NORMALIZE_WHITESPACE
|
|
89
|
-
DBLPPublication(title='Upper Bounds for Stabilization in Acyclic Preference-Based Systems.',
|
|
90
|
-
authors=[DBLPAuthor(name='Fabien Mathieu', key='66/2077')], venue='SSS', type='conference', year=2007,
|
|
91
|
-
key='conf/sss/Mathieu07')
|
|
92
75
|
"""
|
|
93
76
|
r = get(f"https://dblp.org/pid/{a.key}.xml")
|
|
94
77
|
soup = Soup(r, features="xml")
|
|
@@ -100,6 +83,21 @@ class DBLP(DB):
|
|
|
100
83
|
|
|
101
84
|
@dataclass(repr=False)
|
|
102
85
|
class DBLPAuthor(Author, DBLP):
|
|
86
|
+
"""
|
|
87
|
+
Examples
|
|
88
|
+
--------
|
|
89
|
+
|
|
90
|
+
>>> fabien = DBLPAuthor('Fabien Mathieu', key='66/2077')
|
|
91
|
+
>>> publications = sorted(fabien.get_publications(),
|
|
92
|
+
... key=lambda p: p.title)
|
|
93
|
+
>>> publications[0].url # doctest: +NORMALIZE_WHITESPACE
|
|
94
|
+
'https://dblp.org/rec/conf/iptps/BoufkhadMMPV08.html'
|
|
95
|
+
>>> publications[-1] # doctest: +NORMALIZE_WHITESPACE
|
|
96
|
+
DBLPPublication(title='Upper Bounds for Stabilization in Acyclic Preference-Based Systems.',
|
|
97
|
+
authors=[DBLPAuthor(name='Fabien Mathieu', key='66/2077')], venue='SSS', type='conference', year=2007,
|
|
98
|
+
key='conf/sss/Mathieu07')
|
|
99
|
+
|
|
100
|
+
"""
|
|
103
101
|
key: str
|
|
104
102
|
aliases: list = field(default_factory=list)
|
|
105
103
|
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import zlib
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
from gismap.utils.requests import session
|
|
9
|
+
from gismap.sources.dblp import DBLP_TYPES
|
|
10
|
+
|
|
11
|
+
# Fragments of the publication pattern, concatenated below in this exact order.
# Each fragment contributes one capture group; the fragments wrapped in
# ``(?: ... )?`` are optional and leave their group at ``None`` when absent.
# Literal dots in host names are escaped so that they only match a real ".".
key_re = r'<https://dblp\.org/rec/([^>]+)>'
title_re = r'.*?dblp:title\s+"([^"]+)"'
type_re = r'.*?dblp:bibtexType\s+bibtex:(\w+)'
# Greedy on purpose: captures everything up to the last "]" followed by ";".
authors_re = r'.*?dblp:hasSignature\s+(\[.*\])\s*;'
url_re = r'(?:.*?dblp:primaryDocumentPage <([^>]+)>)?'
stream_re = r'(?:.*?dblp:publishedInStream ([^;]+) ;)?'
pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'
year_re = r'.*?"(\d{4})"\^\^<http://www\.w3\.org/2001/XMLSchema#gYear>'

# Full publication pattern. Groups, in order:
# key, title, type, signatures, url, streams, pages, venue, year.
# re.S lets "." cross newlines, since one publication spans several lines.
pub_re = re.compile(
    "".join([
        key_re,
        title_re,
        type_re,
        authors_re,
        url_re,
        stream_re,
        pages_re,
        venue_re,
        year_re,
    ]),
    flags=re.S,
)

# Extracts the "conf/..." or "journals/..." identifier of each stream URL.
streams_re = re.compile(r'<https://dblp\.org/streams/((?:conf|journals)/[^>]+)>')

# Extracts (name, pid) from each bracketed signature; a trailing numeric
# suffix after the name (whitespace followed by digits) is left out of the name.
authid_re = re.compile(
    r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp\.org/pid/([^>]+?)>.*?]',
    flags=re.S,
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_block(dblp_block):
    """
    Extract the fields of one publication from a DBLP turtle block.

    Parameters
    ----------
    dblp_block: :class:`str`
        A DBLP publication, turtle format.

    Returns
    -------
    :class:`tuple` or :class:`NoneType`
        ``None`` if the block does not match the publication pattern or if no
        author signature can be extracted from it. Otherwise the tuple
        ``(key, title, type, authors, url, stream, pages, venue, year)``:

        key: :class:`str`
            DBLP key.
        title: :class:`str`
            Publication title.
        type: :class:`str`
            Type of publication (lowercased, translated through ``DBLP_TYPES``
            when it has an entry for it).
        authors: :class:`dict`
            Publication authors (key -> name).
        url: :class:`str` or :class:`NoneType`
            Publication URL.
        stream: :class:`list` or :class:`NoneType`
            Publication streams (normalized journal/conf).
        pages: :class:`str` or :class:`NoneType`
            Publication pages.
        venue: :class:`str` or :class:`NoneType`
            Publication venue (conf/journal).
        year: :class:`int`
            Year of publication.
    """
    match = pub_re.search(dblp_block)
    if match is None:
        return None

    key, title, raw_type, signatures, url, stream, pages, venue, year = match.groups()

    # Fall back on the lowercased bibtex type when DBLP_TYPES has no entry.
    lowered = raw_type.lower()
    pub_type = DBLP_TYPES.get(lowered, lowered)

    # The raw capture holds stream URLs; keep only their identifiers.
    if stream:
        stream = streams_re.findall(stream)

    # authid_re yields (name, pid) pairs; index the names by pid.
    authors = {}
    for name, pid in authid_re.findall(signatures):
        authors[pid] = name

    if not authors:
        return None
    return key, title, pub_type, authors, url, stream, pages, venue, int(year)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@contextmanager
def get_stream(source, chunk_size=1024 * 64):
    """
    Open a local file or a remote URL as a stream of raw byte chunks.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the content is. Can be a local file or an ``http://`` / ``https://`` URL.
    chunk_size: :class:`int`, optional
        Desired chunk size. For streaming gz content, must be a multiple of 32kB.

    Yields
    ------
    iterable
        Chunk iterator that streams the content (empty if the local file does not exist).
    :class:`int` or :class:`NoneType`
        Source size (used later to compute ETA). ``None`` when the HTTP response
        gives no non-zero ``content-length``; 0 when the local file does not exist.
    """
    # Both URL schemes are fetched over the network; anything else is a local path.
    if isinstance(source, str) and source.startswith(("http://", "https://")):
        with session.get(source, stream=True) as r:
            r.raise_for_status()
            # A missing or zero content-length is reported as None (unknown size).
            total = int(r.headers.get("content-length", 0)) or None
            yield r.iter_content(chunk_size=chunk_size), total
        return

    source = Path(source)
    if not source.exists():
        # A missing file is exposed as an empty stream rather than an error.
        yield [], 0
        return

    total = source.stat().st_size
    with source.open("rb") as file_handle:
        # Read fixed-size chunks until read() returns b"" (end of file).
        yield iter(lambda: file_handle.read(chunk_size), b""), total
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
    """
    Stream the publications found in a gzipped DBLP turtle source.

    The source is read chunk by chunk, decompressed and decoded on the fly,
    split on blank lines, and each resulting block is given to ``parse_block``.
    Blocks for which ``parse_block`` returns nothing are skipped.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the DBLP turtle content is. Can be on a local file or on the Internet.
    chunk_size: :class:`int`, optional
        Desired chunk size. Must be a multiple of 32kB.
    encoding: :class:`str`, default=unicode_escape
        Encoding of stream.

    Yields
    ------
    key: :class:`str`
        DBLP key.
    title: :class:`str`
        Publication title.
    type: :class:`str`
        Type of publication.
    authors: :class:`dict`
        Publication authors (key -> name).
    url: :class:`str` or :class:`NoneType`
        Publication URL.
    stream: :class:`list` or :class:`NoneType`
        Publication streams (normalized journal/conf).
    pages: :class:`str` or :class:`NoneType`
        Publication pages.
    venue: :class:`str` or :class:`NoneType`
        Publication venue (conf/journal).
    year: :class:`int`
        Year of publication.
    """
    import codecs

    # Stateful decoder: chunk boundaries fall at arbitrary byte offsets, so an
    # escape sequence or multi-byte character may be cut in two. Decoding each
    # chunk independently would replace both halves with garbage.
    decoder = codecs.getincrementaldecoder(encoding)(errors="replace")

    with get_stream(source, chunk_size=chunk_size) as (stream, total):
        with tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
            # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
            decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
            text_buffer = ""
            for chunk in stream:
                if not chunk:
                    continue

                # Progress is measured on compressed bytes, like ``total``.
                pbar.update(len(chunk))
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text_buffer += decoder.decode(data)

                # The last piece may be an unfinished block: keep it for later.
                *blocks, text_buffer = text_buffer.split("\n\n")
                for block in blocks:
                    pub = parse_block(block)
                    if pub:
                        yield pub

            # Drain what the decompressor and the decoder still hold.
            text_buffer += decoder.decode(decomp.flush(), final=True)

            for block in text_buffer.split("\n\n"):
                pub = parse_block(block)
                if pub:
                    yield pub
|
gismap/sources/hal.py
CHANGED
|
@@ -44,8 +44,7 @@ class HAL(DB):
|
|
|
44
44
|
>>> HAL.search_author("Ana Busic")
|
|
45
45
|
[HALAuthor(name='Ana Busic', key='anabusic')]
|
|
46
46
|
>>> HAL.search_author("Potop-Butucaru Maria") # doctest: +NORMALIZE_WHITESPACE
|
|
47
|
-
[HALAuthor(name='Potop-Butucaru Maria', key='
|
|
48
|
-
HALAuthor(name='Potop-Butucaru Maria', key='841868', key_type='pid')]
|
|
47
|
+
[HALAuthor(name='Potop-Butucaru Maria', key='841868', key_type='pid')]
|
|
49
48
|
>>> diego = HAL.search_author("Diego Perino")
|
|
50
49
|
>>> diego # doctest: +NORMALIZE_WHITESPACE
|
|
51
50
|
[HALAuthor(name='Diego Perino', key='847558', key_type='pid'),
|
|
@@ -122,8 +121,8 @@ class HAL(DB):
|
|
|
122
121
|
>>> diego = publications[2].authors[3]
|
|
123
122
|
>>> diego
|
|
124
123
|
HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
|
|
125
|
-
>>> len(diego.get_publications())
|
|
126
|
-
|
|
124
|
+
>>> len(diego.get_publications()) > 28
|
|
125
|
+
True
|
|
127
126
|
>>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
|
|
128
127
|
HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
|
|
129
128
|
authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
|
|
@@ -134,12 +133,12 @@ class HAL(DB):
|
|
|
134
133
|
|
|
135
134
|
>>> maria = HAL.search_author('Maria Potop-Butucaru')
|
|
136
135
|
>>> maria # doctest: +NORMALIZE_WHITESPACE
|
|
137
|
-
[HALAuthor(name='Maria Potop-Butucaru', key='
|
|
138
|
-
|
|
139
|
-
>>>
|
|
140
|
-
|
|
141
|
-
>>> len(maria[
|
|
142
|
-
|
|
136
|
+
[HALAuthor(name='Maria Potop-Butucaru', key='841868', key_type='pid')]
|
|
137
|
+
>>> n_pubs = len(HAL.from_author(maria[0]))
|
|
138
|
+
>>> n_pubs > 200
|
|
139
|
+
True
|
|
140
|
+
>>> n_pubs == len(maria[0].get_publications())
|
|
141
|
+
True
|
|
143
142
|
|
|
144
143
|
Note: an error is raised if not enough data is provided
|
|
145
144
|
|
|
@@ -173,6 +172,9 @@ class HAL(DB):
|
|
|
173
172
|
r = get(api, params=params)
|
|
174
173
|
response = json.loads(r)["response"]
|
|
175
174
|
res = [HALPublication.from_json(r) for r in response.get("docs", [])]
|
|
175
|
+
if len(res) == 0 and a.key_type != "fullname":
|
|
176
|
+
name = a.name
|
|
177
|
+
return HAL.from_author(HALAuthor(name=name, key=name, key_type="fullname"))
|
|
176
178
|
return res
|
|
177
179
|
|
|
178
180
|
|
|
@@ -185,6 +187,13 @@ class HALAuthor(Author, HAL):
|
|
|
185
187
|
_img: str = None
|
|
186
188
|
_cv: bool = None
|
|
187
189
|
|
|
190
|
+
def __post_init__(self):
    """
    Guess ``key_type`` from ``key`` when it was not given.

    Nothing happens if ``key`` is empty or ``key_type`` is already set.
    Otherwise an all-digit key is tagged ``"pid"`` and a key containing a
    space is tagged ``"fullname"``; any other key keeps ``key_type`` as ``None``.
    """
    if not self.key or self.key_type is not None:
        return
    # The two cases cannot overlap: an all-digit string has no space.
    if self.key.isdigit():
        self.key_type = "pid"
    elif " " in self.key:
        self.key_type = "fullname"
|
|
196
|
+
|
|
188
197
|
def check_cv(self):
|
|
189
198
|
if self.key_type is not None:
|
|
190
199
|
self._cv = False
|