gismap 0.2.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gismap/sources/dblp.py CHANGED
@@ -13,7 +13,7 @@ from gismap.utils.requests import get
13
13
  class DBLP(DB):
14
14
  db_name: ClassVar[str] = "dblp"
15
15
  author_backoff: ClassVar[float] = 5.0
16
- publi_backoff: ClassVar[float] = 1.0
16
+ publi_backoff: ClassVar[float] = 5.0
17
17
 
18
18
  @classmethod
19
19
  def search_author(cls, name, wait=True):
@@ -72,23 +72,6 @@ class DBLP(DB):
72
72
  Papers available in DBLP.
73
73
  wait: :class:`bool`
74
74
  Wait a bit to avoid 429.
75
-
76
- Examples
77
- --------
78
-
79
- >>> fabien = DBLPAuthor('Fabien Mathieu', key='66/2077')
80
- >>> publications = sorted(DBLP.from_author(fabien),
81
- ... key=lambda p: p.title)
82
- >>> publications[0] # doctest: +NORMALIZE_WHITESPACE
83
- DBLPPublication(title='Achievable catalog size in peer-to-peer video-on-demand systems.',
84
- authors=[DBLPAuthor(name='Yacine Boufkhad', key='75/5742'), DBLPAuthor(name='Fabien Mathieu', key='66/2077'),
85
- DBLPAuthor(name='Fabien de Montgolfier', key='57/6313'), DBLPAuthor(name='Diego Perino', key='03/3645'),
86
- DBLPAuthor(name='Laurent Viennot', key='v/LaurentViennot')],
87
- venue='IPTPS', type='conference', year=2008, key='conf/iptps/BoufkhadMMPV08')
88
- >>> publications[-1] # doctest: +NORMALIZE_WHITESPACE
89
- DBLPPublication(title='Upper Bounds for Stabilization in Acyclic Preference-Based Systems.',
90
- authors=[DBLPAuthor(name='Fabien Mathieu', key='66/2077')], venue='SSS', type='conference', year=2007,
91
- key='conf/sss/Mathieu07')
92
75
  """
93
76
  r = get(f"https://dblp.org/pid/{a.key}.xml")
94
77
  soup = Soup(r, features="xml")
@@ -100,6 +83,21 @@ class DBLP(DB):
100
83
 
101
84
  @dataclass(repr=False)
102
85
  class DBLPAuthor(Author, DBLP):
86
+ """
87
+ Examples
88
+ --------
89
+
90
+ >>> fabien = DBLPAuthor('Fabien Mathieu', key='66/2077')
91
+ >>> publications = sorted(fabien.get_publications(),
92
+ ... key=lambda p: p.title)
93
+ >>> publications[0].url # doctest: +NORMALIZE_WHITESPACE
94
+ 'https://dblp.org/rec/conf/iptps/BoufkhadMMPV08.html'
95
+ >>> publications[-1] # doctest: +NORMALIZE_WHITESPACE
96
+ DBLPPublication(title='Upper Bounds for Stabilization in Acyclic Preference-Based Systems.',
97
+ authors=[DBLPAuthor(name='Fabien Mathieu', key='66/2077')], venue='SSS', type='conference', year=2007,
98
+ key='conf/sss/Mathieu07')
99
+
100
+ """
103
101
  key: str
104
102
  aliases: list = field(default_factory=list)
105
103
 
@@ -0,0 +1,168 @@
1
+ import re
2
+ import zlib
3
+ from contextlib import contextmanager
4
+ from pathlib import Path
5
+
6
+ from tqdm.auto import tqdm
7
+
8
+ from gismap.utils.requests import session
9
+ from gismap.sources.dblp import DBLP_TYPES
10
+
11
# Fragments of the publication pattern, one per field of a DBLP turtle block.
# They are concatenated in this exact order, so each fragment starts with a
# lazy ``.*?`` that skips whatever sits between two captured fields.
# Fragments wrapped in ``(?:...)?`` are optional: their group is ``None``
# when the corresponding predicate is not found.
key_re = r'<https://dblp.org/rec/([^>]+)>'
title_re = r'.*?dblp:title\s+"([^"]+)"'
type_re = r'.*?dblp:bibtexType\s+bibtex:(\w+)'
# Greedy ``\[.*\]``: grabs the whole bracketed signature list in one group.
authors_re = r'.*?dblp:hasSignature\s+(\[.*\])\s*;'
url_re = r'(?:.*?dblp:primaryDocumentPage <([^>]+)>)?'
stream_re = r'(?:.*?dblp:publishedInStream ([^;]+) ;)?'
pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'
year_re = r'.*?"(\d{4})"\^\^<http://www.w3.org/2001/XMLSchema#gYear>'

# Full publication pattern. DOTALL is required because a block spans several lines.
pub_re = re.compile(
    key_re + title_re + type_re + authors_re
    + url_re + stream_re + pages_re + venue_re + year_re,
    flags=re.DOTALL,
)

# Extracts the ``conf/...`` or ``journals/...`` identifier of every stream IRI.
streams_re = re.compile(r'<https://dblp.org/streams/((?:conf|journals)/[^>]+)>')

# One match per signature: (displayed name, author pid). A trailing
# "whitespace + digits" suffix is kept out of the captured name.
authid_re = re.compile(
    r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp.org/pid/([^>]+?)>.*?]',
    re.DOTALL,
)
29
+
30
+
31
def parse_block(dblp_block):
    """
    Extract the fields of one publication from a DBLP turtle block.

    Parameters
    ----------
    dblp_block: :class:`str`
        A DBLP publication, turtle format.

    Returns
    -------
    :class:`tuple` or :class:`NoneType`
        ``None`` if the block does not match the publication pattern or if no
        author could be extracted. Otherwise the tuple
        ``(key, title, type, authors, url, stream, pages, venue, year)`` where:

        key: :class:`str`
            DBLP key.
        title: :class:`str`
            Publication title.
        type: :class:`str`
            Type of publication (lower-cased, then mapped through ``DBLP_TYPES``
            when an entry exists).
        authors: :class:`dict`
            Publication authors (key -> name).
        url: :class:`str` or :class:`NoneType`
            Publication URL.
        stream: :class:`list` or :class:`NoneType`
            Publication streams (normalized journal/conf).
        pages: :class:`str` or :class:`NoneType`
            Publication pages.
        venue: :class:`str` or :class:`NoneType`
            Publication venue (conf/journal).
        year: :class:`int`
            Year of publication.
    """
    match = pub_re.search(dblp_block)
    if match is None:
        return None
    key, title, raw_type, signatures, url, stream, pages, venue, year = match.groups()

    bibtex_type = raw_type.lower()
    pub_type = DBLP_TYPES.get(bibtex_type, bibtex_type)

    # Optional field: only turn it into a list of stream ids when it was captured.
    streams = streams_re.findall(stream) if stream else stream

    # Index authors by pid; a pid seen twice keeps its last name.
    authors = {}
    for name, pid in authid_re.findall(signatures):
        authors[pid] = name
    if not authors:
        return None

    return key, title, pub_type, authors, url, streams, pages, venue, int(year)
71
+
72
+
73
@contextmanager
def get_stream(source, chunk_size=1024 * 64):
    """
    Open a chunked byte stream on a remote or local resource.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the content is. Can be a local file or an HTTP(S) URL.
    chunk_size: :class:`int`, optional
        Desired chunk size. For streaming gz content, must be a multiple of 32kB.

    Yields
    ------
    iterable
        Chunk iterator that streams the content.
    :class:`int` or :class:`NoneType`
        Source size in bytes (used later to compute ETA). ``None`` when the
        server reports no content length, 0 when the local file does not exist.
    """
    # startswith accepts a tuple: plain http URLs are remote sources too and
    # must not be mistaken for a (non-existing) local path.
    if isinstance(source, str) and source.startswith(("http://", "https://")):
        with session.get(source, stream=True) as r:
            r.raise_for_status()
            # A missing (or null) content-length header is reported as None.
            total = int(r.headers.get("content-length", 0)) or None
            yield r.iter_content(chunk_size=chunk_size), total
    else:
        source = Path(source)
        if not source.exists():
            # Best effort: a missing file behaves like an empty stream.
            yield [], 0
            return
        total = source.stat().st_size
        with source.open("rb") as file_handle:
            # iter(callable, sentinel) stops at the first empty read (end of file).
            yield iter(lambda: file_handle.read(chunk_size), b""), total
110
+
111
+
112
def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
    """
    Stream the publications of a gzipped DBLP turtle dump.

    The content is decompressed and decoded on the fly, cut into blocks
    separated by a blank line, and each block is given to :func:`parse_block`.
    Blocks that cannot be parsed are silently skipped.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the DBLP turtle content is. Can be on a local file or on the Internet.
    chunk_size: :class:`int`, optional
        Desired chunk size. Must be a multiple of 32kB.
    encoding: :class:`str`, default=unicode_escape
        Encoding of stream.

    Yields
    ------
    key: :class:`str`
        DBLP key.
    title: :class:`str`
        Publication title.
    type: :class:`str`
        Type of publication.
    authors: :class:`dict`
        Publication authors (key -> name).
    url: :class:`str` or :class:`NoneType`
        Publication URL.
    stream: :class:`list` or :class:`NoneType`
        Publication streams (normalized journal/conf).
    pages: :class:`str` or :class:`NoneType`
        Publication pages.
    venue: :class:`str` or :class:`NoneType`
        Publication venue (conf/journal).
    year: :class:`int`
        Year of publication.
    """
    import codecs  # local import: only needed by this streamer

    with get_stream(source, chunk_size=chunk_size) as (stream, total):
        with tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
            # 16 + MAX_WBITS: expect a gzip header/trailer.
            decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
            # Incremental decoding: a character or escape sequence cut by a
            # chunk boundary is kept pending instead of being decoded (and
            # corrupted) in two halves.
            decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
            text_buffer = ""
            for chunk in stream:
                if not chunk:
                    continue

                # Progress is measured on compressed bytes, like ``total``.
                pbar.update(len(chunk))
                data = decomp.decompress(chunk)
                if not data:
                    continue
                text_buffer += decoder.decode(data)

                # The last piece may be an unfinished block: keep it for later.
                blocks = text_buffer.split("\n\n")
                text_buffer = blocks[-1]
                for block in blocks[:-1]:
                    pub = parse_block(block)
                    if pub:
                        yield pub

            # Drain what the decompressor and the decoder still hold.
            text_buffer += decoder.decode(decomp.flush(), final=True)

            if text_buffer:
                for block in text_buffer.split("\n\n"):
                    pub = parse_block(block)
                    if pub:
                        yield pub
gismap/sources/hal.py CHANGED
@@ -44,8 +44,7 @@ class HAL(DB):
44
44
  >>> HAL.search_author("Ana Busic")
45
45
  [HALAuthor(name='Ana Busic', key='anabusic')]
46
46
  >>> HAL.search_author("Potop-Butucaru Maria") # doctest: +NORMALIZE_WHITESPACE
47
- [HALAuthor(name='Potop-Butucaru Maria', key='858256', key_type='pid'),
48
- HALAuthor(name='Potop-Butucaru Maria', key='841868', key_type='pid')]
47
+ [HALAuthor(name='Potop-Butucaru Maria', key='841868', key_type='pid')]
49
48
  >>> diego = HAL.search_author("Diego Perino")
50
49
  >>> diego # doctest: +NORMALIZE_WHITESPACE
51
50
  [HALAuthor(name='Diego Perino', key='847558', key_type='pid'),
@@ -122,8 +121,8 @@ class HAL(DB):
122
121
  >>> diego = publications[2].authors[3]
123
122
  >>> diego
124
123
  HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
125
- >>> len(diego.get_publications())
126
- 28
124
+ >>> len(diego.get_publications()) > 28
125
+ True
127
126
  >>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
128
127
  HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
129
128
  authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
@@ -134,12 +133,12 @@ class HAL(DB):
134
133
 
135
134
  >>> maria = HAL.search_author('Maria Potop-Butucaru')
136
135
  >>> maria # doctest: +NORMALIZE_WHITESPACE
137
- [HALAuthor(name='Maria Potop-Butucaru', key='858256', key_type='pid'),
138
- HALAuthor(name='Maria Potop-Butucaru', key='841868', key_type='pid')]
139
- >>> len(HAL.from_author(maria[0]))
140
- 26
141
- >>> len(maria[1].get_publications())
142
- 124
136
+ [HALAuthor(name='Maria Potop-Butucaru', key='841868', key_type='pid')]
137
+ >>> n_pubs = len(HAL.from_author(maria[0]))
138
+ >>> n_pubs > 200
139
+ True
140
+ >>> n_pubs == len(maria[0].get_publications())
141
+ True
143
142
 
144
143
  Note: an error is raised if not enough data is provided
145
144
 
@@ -173,6 +172,9 @@ class HAL(DB):
173
172
  r = get(api, params=params)
174
173
  response = json.loads(r)["response"]
175
174
  res = [HALPublication.from_json(r) for r in response.get("docs", [])]
175
+ if len(res) == 0 and a.key_type != "fullname":
176
+ name = a.name
177
+ return HAL.from_author(HALAuthor(name=name, key=name, key_type="fullname"))
176
178
  return res
177
179
 
178
180
 
@@ -185,6 +187,13 @@ class HALAuthor(Author, HAL):
185
187
  _img: str = None
186
188
  _cv: bool = None
187
189
 
190
def __post_init__(self):
    """
    Guess ``key_type`` from the shape of ``key`` when it was not provided.

    A key made only of digits is tagged ``"pid"``; a key that contains a space
    is tagged ``"fullname"``. Any other key leaves ``key_type`` untouched.
    """
    # Nothing to infer without a key, and an explicit key_type always wins.
    if not self.key or self.key_type is not None:
        return
    # The two tests are mutually exclusive (a space is not a digit).
    if " " in self.key:
        self.key_type = "fullname"
    elif self.key.isdigit():
        self.key_type = "pid"
196
+
188
197
  def check_cv(self):
189
198
  if self.key_type is not None:
190
199
  self._cv = False