gismap 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gismap/__init__.py +20 -0
- gismap/author.py +0 -0
- gismap/gismap.py +1 -0
- gismap/gismo.py +379 -0
- gismap/lab/__init__.py +10 -0
- gismap/lab/graph.py +234 -0
- gismap/lab/lab.py +152 -0
- gismap/lab/lip6.py +43 -0
- gismap/lab/toulouse.py +47 -0
- gismap/lab/vis.py +171 -0
- gismap/search.py +215 -0
- gismap/sources/__init__.py +0 -0
- gismap/sources/dblp.py +162 -0
- gismap/sources/hal.py +272 -0
- gismap/sources/models.py +31 -0
- gismap/sources/multi.py +135 -0
- gismap/utils/__init__.py +0 -0
- gismap/utils/common.py +60 -0
- gismap/utils/logger.py +4 -0
- gismap/utils/requests.py +33 -0
- gismap/utils/text.py +93 -0
- gismap-0.1.0.dist-info/METADATA +62 -0
- gismap-0.1.0.dist-info/RECORD +25 -0
- gismap-0.1.0.dist-info/WHEEL +4 -0
- gismap-0.1.0.dist-info/licenses/AUTHORS.md +9 -0
gismap/sources/dblp.py
ADDED
@@ -0,0 +1,162 @@
from typing import ClassVar
from dataclasses import dataclass, field
from urllib.parse import quote_plus
from bs4 import BeautifulSoup as Soup
from time import sleep

from gismap.sources.models import DB, Author, Publication
from gismap.utils.text import clean_aliases
from gismap.utils.requests import get


@dataclass(repr=False)
class DBLP(DB):
    db_name: ClassVar[str] = "dblp"
    author_backoff: ClassVar[float] = 7.0
    publi_backoff: ClassVar[float] = 2.0

    @classmethod
    def search_author(cls, name, wait=True):
        """
        Parameters
        ----------
        name: :class:`str`
            People to find.
        wait: :class:`bool`
            Wait a bit to avoid 429.

        Returns
        -------
        :class:`list`
            Potential matches.

        Examples
        --------

        >>> fabien = DBLP.search_author("Fabien Mathieu")
        >>> fabien
        [DBLPAuthor(name='Fabien Mathieu', key='66/2077')]
        >>> fabien[0].url
        'https://dblp.org/pid/66/2077.html'
        >>> manu = DBLP.search_author("Manuel Barragan")
        >>> manu # doctest: +NORMALIZE_WHITESPACE
        [DBLPAuthor(name='Manuel Barragan', key='07/10587'),
        DBLPAuthor(name='Manuel Barragan', key='83/3865'),
        DBLPAuthor(name='Manuel Barragan', key='188/0198')]
        >>> DBLP.search_author("NotaSearcherName", wait=False)
        []
        """
        dblp_api = "https://dblp.org/search/author/api"
        dblp_args = {"q": name}
        r = get(dblp_api, params=dblp_args)
        soup = Soup(r, features="xml")
        if wait:
            sleep(cls.author_backoff)
        return [
            DBLPAuthor(
                name=name,
                key=hit.url.text.split("pid/")[1],
                aliases=clean_aliases(
                    name, [hit.author.text] + [alia.text for alia in hit("alias")]
                ),
            )
            for hit in soup("hit")
        ]

    @classmethod
    def from_author(cls, a, wait=True):
        """
        Returns
        -------
        :class:`list`
            Papers available in DBLP.
        wait: :class:`bool`
            Wait a bit to avoid 429.

        Examples
        --------

        >>> fabien = DBLPAuthor('Fabien Mathieu', key='66/2077')
        >>> publications = sorted(DBLP.from_author(fabien),
        ...                       key=lambda p: p.title)
        >>> publications[0] # doctest: +NORMALIZE_WHITESPACE
        DBLPPublication(title='Achievable catalog size in peer-to-peer video-on-demand systems.',
        authors=[DBLPAuthor(name='Yacine Boufkhad', key='75/5742'), DBLPAuthor(name='Fabien Mathieu', key='66/2077'),
        DBLPAuthor(name='Fabien de Montgolfier', key='57/6313'), DBLPAuthor(name='Diego Perino', key='03/3645'),
        DBLPAuthor(name='Laurent Viennot', key='v/LaurentViennot')],
        venue='IPTPS', type='conference', year=2008, key='conf/iptps/BoufkhadMMPV08',
        url='https://dblp.org/rec/conf/iptps/BoufkhadMMPV08.html', pages=4)
        >>> publications[-1] # doctest: +NORMALIZE_WHITESPACE
        DBLPPublication(title='Upper Bounds for Stabilization in Acyclic Preference-Based Systems.',
        authors=[DBLPAuthor(name='Fabien Mathieu', key='66/2077')], venue='SSS', type='conference', year=2007,
        key='conf/sss/Mathieu07', url='https://dblp.org/rec/conf/sss/Mathieu07.html', pages='372-382')
        """
        r = get(f"https://dblp.org/pid/{a.key}.xml")
        soup = Soup(r, features="xml")
        if wait:
            sleep(cls.author_backoff)
        res = [DBLPPublication.from_soup(r) for r in soup("r")]
        return [p for p in res if p.authors]


@dataclass(repr=False)
class DBLPAuthor(Author, DBLP):
    key: str
    aliases: list = field(default_factory=list)

    @property
    def url(self):
        if self.key:
            return f"https://dblp.org/pid/{self.key}.html"
        return f"https://dblp.org/search?q={quote_plus(self.name)}"

    def get_publications(self, wait=False):
        return DBLP.from_author(self, wait=wait)


DBLP_TYPES = {
    "article": "journal",
    "inproceedings": "conference",
    "proceedings": "book",
    "informal": "report",
    "phdthesis": "thesis",
    "habil": "hdr",
    "software": "software",
}


@dataclass(repr=False)
class DBLPPublication(Publication, DBLP):
    key: str
    url: str = None
    pages: str = None
    volume: int = None
    number: int = None

    @classmethod
    def from_soup(cls, soup):
        p = soup.find()
        typ = p.get("publtype", p.name)
        typ = DBLP_TYPES.get(typ, typ)
        res = {
            "type": typ,
            "key": p["key"],
            "url": f"https://dblp.org/rec/{p['key']}.html",
        }
        keys = ["title", "booktitle", "pages", "journal", "year", "volume", "number"]
        for tag in keys:
            t = p.find(tag)
            if t:
                try:
                    res[tag] = int(t.text)
                except ValueError:
                    res[tag] = t.text
        for tag in ["booktitle", "journal"]:
            t = p.find(tag)
            if t:
                res["venue"] = t.text
                break
        else:
            res["venue"] = "unpublished"
        res["authors"] = [DBLPAuthor(key=a["pid"], name=a.text) for a in p("author")]
        return cls(**{k: v for k, v in res.items() if k in cls.__match_args__})
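For orientation, here is a minimal, hypothetical usage sketch of the DBLP source above (it is not part of the wheel): it assumes the package is installed and dblp.org is reachable, and its output depends on live DBLP data.

```python
# Hypothetical usage sketch (not in the package): query DBLP for an author,
# then fetch that author's publications. Live network access is assumed.
from gismap.sources.dblp import DBLP

candidates = DBLP.search_author("Fabien Mathieu")  # sleeps ~7s by default to avoid 429
if candidates:
    author = candidates[0]
    print(author.url)                              # https://dblp.org/pid/<pid>.html
    for pub in author.get_publications()[:3]:      # DBLPPublication objects
        print(pub.year, pub.venue, pub.title)
```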
gismap/sources/hal.py
ADDED
@@ -0,0 +1,272 @@
from typing import ClassVar
from dataclasses import dataclass, field
from collections import defaultdict
from urllib.parse import quote_plus
import json

from gismap.sources.models import DB, Publication, Author  # DBAuthor, DBPublication
from gismap.utils.text import clean_aliases
from gismap.utils.requests import get
from gismap.utils.common import unlist


@dataclass(repr=False)
class HAL(DB):
    db_name: ClassVar[str] = "hal"

    @classmethod
    def search_author(cls, name):
        """
        Parameters
        ----------
        name: :class:`str`
            People to find.

        Returns
        -------
        :class:`list`
            Potential matches.

        Examples
        --------

        >>> fabien = HAL.search_author("Fabien Mathieu")
        >>> fabien
        [HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')]
        >>> fabien = fabien[0]
        >>> fabien.url
        'https://hal.science/search/index/?q=*&authIdHal_s=fabien-mathieu'
        >>> HAL.search_author("Laurent Viennot")[0]
        HALAuthor(name='Laurent Viennot', key='laurentviennot')
        >>> HAL.search_author("NotaSearcherName")
        []
        >>> HAL.search_author("Ana Busic")
        [HALAuthor(name='Ana Busic', key='anabusic')]
        >>> HAL.search_author("Potop-Butucaru Maria") # doctest: +NORMALIZE_WHITESPACE
        [HALAuthor(name='Potop-Butucaru Maria', key='858256', key_type='pid'),
        HALAuthor(name='Potop-Butucaru Maria', key='841868', key_type='pid')]
        >>> diego = HAL.search_author("Diego Perino")
        >>> diego # doctest: +NORMALIZE_WHITESPACE
        [HALAuthor(name='Diego Perino', key='847558', key_type='pid'),
        HALAuthor(name='Diego Perino', key='978810', key_type='pid')]
        >>> diego[1].url
        'https://hal.science/search/index/?q=*&authIdPerson_i=978810'
        """
        hal_api = "https://api.archives-ouvertes.fr/ref/author/"
        fields = ",".join(["label_s", "idHal_s", "person_i", "fullName_s"])
        hal_args = {"q": name, "fl": fields, "wt": "json"}
        r = get(hal_api, params=hal_args)
        response = json.loads(r)["response"]
        hids = defaultdict(set)
        pids = defaultdict(set)
        names = set()
        for a in response.get("docs", []):
            if "label_s" in a:
                if "idHal_s" in a:
                    hids[a["idHal_s"]].add(a.get("label_s"))
                elif "person_i" in a:
                    pids[a["person_i"]].add(a.get("label_s"))
                elif "fullName_s" in a:
                    names.add(a["fullName_s"])
        res = [
            HALAuthor(name=name, key=k, aliases=clean_aliases(name, v))
            for k, v in hids.items()
        ] + [
            HALAuthor(
                name=name, key=str(k), aliases=clean_aliases(name, v), key_type="pid"
            )
            for k, v in pids.items()
        ]
        return (
            res
            if res
            else [
                HALAuthor(
                    name=name,
                    key=fullname,
                    aliases=clean_aliases(name, fullname),
                    key_type="fullname",
                )
                for fullname in names
            ]
        )

    @classmethod
    def from_author(cls, a):
        """
        Parameters
        ----------
        a: :class:`~gismap.sources.hal.HALAuthor`
            Hal researcher.

        Returns
        -------
        :class:`list`
            Papers available in HAL.

        Examples
        --------

        >>> fabien = HAL.search_author("Fabien Mathieu")[0]
        >>> publications = sorted(fabien.get_publications(), key=lambda p: p.title)
        >>> publications[2] # doctest: +NORMALIZE_WHITESPACE
        HALPublication(title='Achievable Catalog Size in Peer-to-Peer Video-on-Demand Systems',
        authors=[HALAuthor(name='Yacine Boufkhad', key='yacine-boufkhad'),
        HALAuthor(name='Fabien Mathieu', key='fabien-mathieu'),
        HALAuthor(name='Fabien de Montgolfier', key='949013', key_type='pid'),
        HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname'),
        HALAuthor(name='Laurent Viennot', key='laurentviennot')],
        venue='Proceedings of the 7th Internnational Workshop on Peer-to-Peer Systems (IPTPS)', type='conference',
        year=2008, key='471724', url='https://inria.hal.science/inria-00471724v1')
        >>> diego = publications[2].authors[3]
        >>> diego
        HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
        >>> len(diego.get_publications())
        28
        >>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
        HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
        authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
        venue="SSS'07 - 9th international conference on Stabilization, Safety, and Security of Distributed Systems",
        type='conference', year=2007, key='668356', url='https://inria.hal.science/hal-00668356v1')

        Case of someone with multiple ids one wants to cumulate:

        >>> maria = HAL.search_author('Maria Potop-Butucaru')
        >>> maria # doctest: +NORMALIZE_WHITESPACE
        [HALAuthor(name='Maria Potop-Butucaru', key='858256', key_type='pid'),
        HALAuthor(name='Maria Potop-Butucaru', key='841868', key_type='pid')]
        >>> len(HAL.from_author(maria[0]))
        26
        >>> len(maria[1].get_publications())
        123

        Note: an error is raised if not enough data is provided.

        >>> HAL.from_author(HALAuthor('Fabien Mathieu'))
        Traceback (most recent call last):
        ...
        ValueError: HALAuthor(name='Fabien Mathieu') must have a key for publications to be fetched.
        """
        api = "https://api.archives-ouvertes.fr/search/"
        fields = [
            "docid",
            "abstract_s",
            "label_s",
            "uri_s",
            "*Title_s",
            "title_s",
            "producedDateY_i",
            "auth_s",
            "authFullNamePersonIDIDHal_fs",
            "docType_s",
        ]
        params = {"fl": fields, "rows": 2000, "wt": "json"}
        if a.key is None:
            raise ValueError(f"{a} must have a key for publications to be fetched.")
        if a.key_type == "pid":
            params["q"] = f"authIdPerson_i:{a.key}"
        elif a.key_type == "fullname":
            params["q"] = f'authFullName_s:"{a.key}"'
        else:
            params["q"] = f"authIdHal_s:{a.key}"
        r = get(api, params=params)
        response = json.loads(r)["response"]
        res = [HALPublication.from_json(r) for r in response.get("docs", [])]
        return res


@dataclass(repr=False)
class HALAuthor(Author, HAL):
    key: str | int = None
    key_type: str = None
    aliases: list = field(default_factory=list)

    @property
    def url(self):
        if self.key_type == "pid":
            return f"https://hal.science/search/index/?q=*&authIdPerson_i={self.key}"
        elif self.key_type == "fullname":
            return f"https://hal.science/search/index?q={quote_plus(self.name)}"
        else:
            return f"https://hal.science/search/index/?q=*&authIdHal_s={self.key}"

    def get_publications(self):
        return HAL.from_author(self)


def parse_facet_author(a):
    """

    Parameters
    ----------
    a: :class:`str`
        Hal facet of author.

    Returns
    -------
    :class:`~gismap.sources.hal.HALAuthor`

    """
    name, pid, hid = a.split("_FacetSep_")
    if hid:
        return HALAuthor(name=name, key=hid)
    elif pid and int(pid):
        return HALAuthor(name=name, key=pid, key_type="pid")
    else:
        return HALAuthor(name=name, key=name, key_type="fullname")


HAL_TYPES = {
    "ART": "journal",
    "COMM": "conference",
    "OUV": "book",
    "COUV": "chapter",
    "THESE": "thesis",
    "UNDEFINED": "report",
}

HAL_KEYS = {
    "title_s": "title",
    "abstract_s": "abstract",
    "docid": "key",
    "bookTitle_s": "booktitle",
    "conferenceTitle_s": "conference",
    "journalTitle_s": "journal",
    "docType_s": "type",
    "producedDateY_i": "year",
    "uri_s": "url",
}


@dataclass(repr=False)
class HALPublication(Publication, HAL):
    key: str
    abstract: str = None
    url: str = None

    @classmethod
    def from_json(cls, r):
        """

        Parameters
        ----------
        r: :class:`dict`
            De-serialized JSON.

        Returns
        -------
        :class:`~gismap.sources.hal.HALPublication`

        """
        res = {v: unlist(r[k]) for k, v in HAL_KEYS.items() if k in r}
        res["authors"] = [
            parse_facet_author(a) for a in r.get("authFullNamePersonIDIDHal_fs", [])
        ]
        for tag in ["booktitle", "journal", "conference"]:
            if tag in res:
                res["venue"] = res[tag]
                break
        else:
            res["venue"] = "unpublished"
        res["type"] = HAL_TYPES.get(res["type"], res["type"].lower())
        return cls(**{k: v for k, v in res.items() if k in cls.__match_args__})
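As above, a short hypothetical sketch of how the HAL source is meant to be used, based on its doctests; it assumes live access to api.archives-ouvertes.fr and the identifiers shown are taken from the examples above.

```python
# Hypothetical sketch (not in the package): resolve a HAL author and list papers.
from gismap.sources.hal import HAL, HALAuthor

author = HAL.search_author("Fabien Mathieu")[0]  # keyed by idHal by default
print(author.url)
pubs = author.get_publications()                 # queries authIdHal_s:<key>
print(len(pubs))

# Authors known only through a numeric HAL person id use key_type='pid'
# (the id below comes from the doctests) and a different query field.
maria = HALAuthor(name="Maria Potop-Butucaru", key="858256", key_type="pid")
print(len(maria.get_publications()))             # queries authIdPerson_i:<key>
```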
gismap/sources/models.py
ADDED
@@ -0,0 +1,31 @@
from dataclasses import dataclass
from typing import ClassVar

from gismap.utils.common import LazyRepr


@dataclass(repr=False)
class Author(LazyRepr):
    name: str


@dataclass(repr=False)
class Publication(LazyRepr):
    title: str
    authors: list
    venue: str
    type: str
    year: int


@dataclass(repr=False)
class DB(LazyRepr):
    db_name: ClassVar[str] = None

    @classmethod
    def search_author(cls, name):
        raise NotImplementedError

    @classmethod
    def from_author(cls, a):
        raise NotImplementedError
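These three dataclasses are the contract every source implements (DBLP and HAL in this wheel). A minimal hypothetical source, sketched here purely for illustration, would subclass them as follows; ToyDB and its classes are made up and not part of the package.

```python
# Illustrative only: ToyDB mirrors how DBLP/HAL combine Author/Publication/DB.
from typing import ClassVar
from dataclasses import dataclass

from gismap.sources.models import DB, Author, Publication


@dataclass(repr=False)
class ToyDB(DB):
    db_name: ClassVar[str] = "toy"

    @classmethod
    def search_author(cls, name):
        # A real source would query an external API here.
        return [ToyAuthor(name=name, key=name.lower().replace(" ", "-"))]

    @classmethod
    def from_author(cls, a):
        return [
            ToyPublication(title="A toy paper", authors=[a], venue="unpublished",
                           type="report", year=2024, key=f"{a.key}-1")
        ]


@dataclass(repr=False)
class ToyAuthor(Author, ToyDB):
    key: str


@dataclass(repr=False)
class ToyPublication(Publication, ToyDB):
    key: str
```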
gismap/sources/multi.py
ADDED
@@ -0,0 +1,135 @@
from dataclasses import dataclass, field
from bof.fuzz import Process
import numpy as np

from gismap.sources.models import Publication, Author
from gismap.utils.text import clean_aliases


score_rosetta = {
    "db_name": {"dblp": 1, "hal": 2},
    "venue": {"CoRR": -1, "unpublished": -2},
    "type": {"conference": 1, "journal": 2},
}


@dataclass(repr=False)
class SourcedAuthor(Author):
    sources: list = field(default_factory=list)

    @property
    def key(self):
        if self.sources:
            return self.sources[0].key
        else:
            return None

    @property
    def aliases(self):
        if self.sources:
            return clean_aliases(
                self.name, [n for a in self.sources for n in [a.name] + a.aliases]
            )
        else:
            return []

    @classmethod
    def from_sources(cls, sources):
        return cls(name=sources[0].name, sources=sources)

    def get_publications(self, clean=True):
        res = {p.key: p for a in self.sources for p in a.get_publications()}
        if clean:
            regroup_authors({self.key: self}, res)
            return regroup_publications(res)
        else:
            return res


@dataclass(repr=False)
class SourcedPublication(Publication):
    key: str
    sources: list = field(default_factory=list)

    @classmethod
    def from_sources(cls, sources):
        sources = sorted(sources, key=cls.score_source, reverse=True)
        main = sources[0]
        res = cls(
            **{k: getattr(main, k) for k in main.__dict__ if k in cls.__match_args__},
            sources=sources,
        )
        for k, v in main.__dict__.items():
            if k not in cls.__match_args__:
                setattr(res, k, v)
        return res

    @staticmethod
    def score_source(source):
        scores = [v.get(getattr(source, k, None), 0) for k, v in score_rosetta.items()]
        scores.append(source.year)
        return tuple(scores)


def regroup_authors(auth_dict, pub_dict):
    """
    Replace authors of publications with matching authors.
    Typical use: upgrade DB-specific authors to multisource authors.

    Replacement is in place.

    Parameters
    ----------
    auth_dict: :class:`dict`
        Authors to unify.
    pub_dict: :class:`dict`
        Publications to unify.

    Returns
    -------
    None
    """
    redirection = {
        k: a
        for a in auth_dict.values()
        for s in a.sources
        for k in [s.key, s.name, *s.aliases]
    }

    for pub in pub_dict.values():
        pub.authors = [redirection.get(a.key, a) for a in pub.authors]


def regroup_publications(pub_dict, threshold=90, length_impact=0.2):
    """
    Puts together copies of the same publication.

    Parameters
    ----------
    pub_dict: :class:`dict`
        Publications to unify.
    threshold: float
        Similarity parameter.
    length_impact: float
        Length impact parameter.

    Returns
    -------
    :class:`dict`
        Unified publications.
    """
    pub_list = [p for p in pub_dict.values()]

    p = Process(length_impact=length_impact)
    p.fit([paper.title for paper in pub_list])

    res = dict()
    done = np.zeros(len(pub_list), dtype=bool)
    for i, paper in enumerate(pub_list):
        if done[i]:
            continue
        locs = np.where(p.transform([paper.title])[0, :] > threshold)[0]
        pub = SourcedPublication.from_sources([pub_list[i] for i in locs])
        res[pub.key] = pub
        done[locs] = True
    return res
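A hypothetical end-to-end sketch of this merging layer, assuming both the DBLP and HAL APIs are reachable: the source-specific matches are wrapped into a SourcedAuthor, and get_publications() deduplicates papers across the two databases via regroup_publications.

```python
# Hypothetical sketch (not in the package): merge one researcher's DBLP and HAL
# records and deduplicate the resulting publication list.
from gismap.sources.dblp import DBLP
from gismap.sources.hal import HAL
from gismap.sources.multi import SourcedAuthor

sources = DBLP.search_author("Fabien Mathieu") + HAL.search_author("Fabien Mathieu")
author = SourcedAuthor.from_sources(sources)

merged = author.get_publications()   # dict of SourcedPublication, fuzzy-deduplicated
for pub in list(merged.values())[:3]:
    dbs = [s.db_name for s in pub.sources]
    print(pub.year, pub.venue, pub.title, dbs)
```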
gismap/utils/__init__.py
ADDED
File without changes
gismap/utils/common.py
ADDED
@@ -0,0 +1,60 @@
HIDDEN_KEYS = {"sources", "aliases", "abstract"}


class LazyRepr:
    """
    MixIn that hides empty fields in dataclasses repr's.
    """

    def __repr__(self):
        kws = [
            f"{key}={value!r}"
            for key, value in self.__dict__.items()
            if value and key not in HIDDEN_KEYS
        ]
        return f"{type(self).__name__}({', '.join(kws)})"


def unlist(x):
    """
    Parameters
    ----------
    x: :class:`str` or :class:`list` or :class:`int`
        Something.

    Returns
    -------
    x: :class:`str` or :class:`int`
        If it's a list, its first element; otherwise x unchanged.
    """
    return x[0] if (isinstance(x, list) and x) else x


def get_classes(root, key="name"):
    """
    Parameters
    ----------
    root: :class:`class`
        Starting class (can be abstract).
    key: :class:`str`, default='name'
        Attribute to look up.

    Returns
    -------
    :class:`dict`
        Dictionary of all subclasses that have a `key` attribute (as in class attribute `key`).

    Examples
    --------

    >>> from gismap.sources.models import DB
    >>> subclasses = get_classes(DB, key='db_name')
    >>> dict(sorted(subclasses.items())) # doctest: +NORMALIZE_WHITESPACE
    {'dblp': <class 'gismap.sources.dblp.DBLP'>, 'hal': <class 'gismap.sources.hal.HAL'>}
    """
    result = {
        getattr(c, key): c for c in root.__subclasses__() if getattr(c, key, None)
    }
    for c in root.__subclasses__():
        result.update(get_classes(c))
    return result
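A quick illustration of LazyRepr and unlist, sketched for this listing (the Demo class is hypothetical): falsy fields and the keys in HIDDEN_KEYS are omitted from the repr, and unlist collapses a non-empty list to its first element.

```python
# Illustration only: Demo is a made-up dataclass using the LazyRepr mixin.
from dataclasses import dataclass, field

from gismap.utils.common import LazyRepr, unlist


@dataclass(repr=False)
class Demo(LazyRepr):
    name: str
    note: str = None                             # falsy values are skipped in repr
    aliases: list = field(default_factory=list)  # always hidden (in HIDDEN_KEYS)


print(Demo(name="Ada"))                # Demo(name='Ada')
print(Demo(name="Ada", note="hello"))  # Demo(name='Ada', note='hello')
print(unlist(["first", "second"]))     # first
print(unlist(42))                      # 42
```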
gismap/utils/logger.py
ADDED
gismap/utils/requests.py
ADDED
@@ -0,0 +1,33 @@
from time import sleep
import requests
from gismap.utils.logger import logger


session = requests.Session()


def get(url, params=None):
    """
    Parameters
    ----------
    url: :class:`str`
        Entry point to fetch.
    params: :class:`dict`, optional
        Get arguments (appended to URL).

    Returns
    -------
    :class:`str`
        Result.
    """
    while True:
        r = session.get(url, params=params)
        if r.status_code == 429:
            try:
                t = int(r.headers["Retry-After"])
            except KeyError:
                t = 60
            logger.warning(f"Too many requests. Auto-retry in {t} seconds.")
            sleep(t)
        else:
            return r.text
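Finally, a small hypothetical sketch of how this shared helper behaves: on HTTP 429 it waits for the Retry-After delay (or 60 seconds when the header is absent) and retries, so callers simply receive the response body as text. The URLs below are the same endpoints used elsewhere in the package.

```python
# Hypothetical sketch (not in the package): both public APIs used by gismap
# can be fetched through the same retrying helper.
from gismap.utils.requests import get

dblp_xml = get("https://dblp.org/pid/66/2077.xml")  # returns the body as str
hal_json = get(
    "https://api.archives-ouvertes.fr/ref/author/",
    params={"q": "Fabien Mathieu", "wt": "json"},
)
print(dblp_xml[:80])
print(hal_json[:80])
```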