gismap 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gismap/sources/dblp.py CHANGED
@@ -13,7 +13,7 @@ from gismap.utils.requests import get
13
13
  class DBLP(DB):
14
14
  db_name: ClassVar[str] = "dblp"
15
15
  author_backoff: ClassVar[float] = 5.0
16
- publi_backoff: ClassVar[float] = 1.0
16
+ publi_backoff: ClassVar[float] = 5.0
17
17
 
18
18
  @classmethod
19
19
  def search_author(cls, name, wait=True):
@@ -98,6 +98,7 @@ class DBLPAuthor(Author, DBLP):
98
98
  key='conf/sss/Mathieu07')
99
99
 
100
100
  """
101
+
101
102
  key: str
102
103
  aliases: list = field(default_factory=list)
103
104
 
@@ -124,6 +125,27 @@ DBLP_TYPES = {
124
125
 
125
126
  @dataclass(repr=False)
126
127
  class DBLPPublication(Publication, DBLP):
128
+ """
129
+ Publication from the DBLP database.
130
+
131
+ Parameters
132
+ ----------
133
+ title: :class:`str`
134
+ Publication title.
135
+ authors: :class:`list`
136
+ List of :class:`DBLPAuthor` objects.
137
+ venue: :class:`str`
138
+ Publication venue.
139
+ type: :class:`str`
140
+ Publication type.
141
+ year: :class:`int`
142
+ Publication year.
143
+ key: :class:`str`
144
+ DBLP record key.
145
+ metadata: :class:`dict`
146
+ Additional metadata (pages, volume, etc.).
147
+ """
148
+
127
149
  key: str
128
150
  metadata: dict = field(default_factory=dict)
129
151
 
@@ -0,0 +1,187 @@
1
+ import re
2
+ import zlib
3
+ from contextlib import contextmanager
4
+ from pathlib import Path
5
+
6
+ from tqdm.auto import tqdm
7
+
8
+ from gismap.utils.requests import session
9
+ from gismap.sources.dblp import DBLP_TYPES
10
+
11
# Fragments of the publication pattern. They are concatenated below in this
# exact order, so a block must present its fields in the same order to match.

# Mandatory fields: each one contributes exactly one capture group.
key_re = r"<https://dblp.org/rec/([^>]+)>"
title_re = r'.*?dblp:title\s+"([^"]+)"'
type_re = r".*?dblp:bibtexType\s+bibtex:(\w+)"
# Greedy on purpose: captures from the first "[" up to the last "]" that is
# followed by ";", i.e. the whole list of signatures in one group.
authors_re = r".*?dblp:hasSignature\s+(\[.*\])\s*;"

# Optional fields: wrapped in "(?:...)?" so their group is None when absent.
url_re = r"(?:.*?dblp:primaryDocumentPage <([^>]+)>)?"
stream_re = r"(?:.*?dblp:publishedInStream ([^;]+) ;)?"
pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'

# Mandatory again: a four-digit literal typed as xsd:gYear.
year_re = r'.*?"(\d{4})"\^\^<http://www.w3.org/2001/XMLSchema#gYear>'

# Full publication pattern; groups come out as
# (key, title, type, authors, url, stream, pages, venue, year).
# DOTALL (same flag as re.S) lets "." cross line breaks inside a block.
pub_re = re.compile(
    key_re
    + title_re
    + type_re
    + authors_re
    + url_re
    + stream_re
    + pages_re
    + venue_re
    + year_re,
    flags=re.DOTALL,
)

# Extracts the "conf/..." or "journals/..." part of every stream URI.
streams_re = re.compile(r"<https://dblp.org/streams/((?:conf|journals)/[^>]+)>")

# One match per "[ ... ]" signature: (name, pid). A trailing run of digits
# separated from the name by whitespace is left out of the captured name.
authid_re = re.compile(
    r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp.org/pid/([^>]+?)>.*?]',
    flags=re.DOTALL,
)
44
+
45
+
46
def parse_block(dblp_block):
    """
    Extract the fields of one DBLP publication from its turtle description.

    Parameters
    ----------
    dblp_block: :class:`str`
        A DBLP publication, turtle format.

    Returns
    -------
    :class:`tuple` or :class:`NoneType`
        ``None`` when the block does not match the publication pattern or when
        no author could be extracted from it. Otherwise a 9-tuple made of:

        key: :class:`str`
            DBLP key.
        title: :class:`str`
            Publication title.
        type: :class:`str`
            Type of publication (lower-cased bibtex type, translated through
            ``DBLP_TYPES`` when it has an entry there).
        authors: :class:`dict`
            Publication authors (key -> name).
        url: :class:`str` or :class:`NoneType`
            Publication URL.
        stream: :class:`list` or :class:`NoneType`
            Publication streams (normalized journal/conf).
        pages: :class:`str` or :class:`NoneType`
            Publication pages.
        venue: :class:`str` or :class:`NoneType`
            Publication venue (conf/journal).
        year: :class:`int`
            Year of publication.
    """
    match = pub_re.search(dblp_block)
    if match is None:
        return None

    (
        key,
        title,
        raw_type,
        signatures,
        url,
        raw_streams,
        pages,
        venue,
        year,
    ) = match.groups()

    # authid_re yields (name, pid) pairs; the mapping is indexed by pid.
    authors = {pid: name for name, pid in authid_re.findall(signatures)}
    if not authors:
        # A publication without any identifiable author is discarded.
        return None

    bibtex_type = raw_type.lower()
    pub_type = DBLP_TYPES.get(bibtex_type, bibtex_type)

    # The stream group is optional: keep None as is, otherwise list the ids.
    streams = streams_re.findall(raw_streams) if raw_streams else raw_streams

    return key, title, pub_type, authors, url, streams, pages, venue, int(year)
86
+
87
+
88
@contextmanager
def get_stream(source, chunk_size=1024 * 64):
    """
    Open a source and expose it as an iterable of raw byte chunks.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the content is. A string starting with ``https://`` is fetched
        through the shared requests session; anything else is treated as a
        local file path.
    chunk_size: :class:`int`, optional
        Desired chunk size. Documented as needing to be a multiple of 32kB for
        streamed gz content; this is not checked here.

    Yields
    ------
    iterable
        Chunk iterator that streams the content. It is only usable inside the
        ``with`` block, as the underlying response/file is closed on exit.
        Empty list if the local file does not exist.
    :class:`int` or :class:`NoneType`
        Source size (used later to compute ETA): file size for a local file,
        ``0`` for a missing local file, ``content-length`` header for a URL
        (``None`` when the header is absent or zero).
    """
    is_remote = isinstance(source, str) and source.startswith("https://")
    if is_remote:
        with session.get(source, stream=True) as response:
            response.raise_for_status()
            # A missing (or zero) content-length is reported as unknown size.
            size = int(response.headers.get("content-length", 0)) or None
            yield response.iter_content(chunk_size=chunk_size), size
        return

    path = Path(source)
    if not path.exists():
        # Missing file: hand out an empty stream instead of raising.
        yield [], 0
        return

    size = path.stat().st_size
    with path.open("rb") as handle:
        # Two-argument iter(): call read() until it returns b"" (end of file).
        yield iter(lambda: handle.read(chunk_size), b""), size
127
+
128
+
129
def _decode_and_parse(raw_blocks, encoding):
    """Decode complete raw blocks and yield the publications parsed from them."""
    for raw_block in raw_blocks:
        pub = parse_block(raw_block.decode(encoding, errors="replace"))
        if pub:
            yield pub


def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
    """
    Stream the publications found in a gzip-compressed DBLP turtle dump.

    The source is read chunk by chunk, decompressed on the fly, cut into
    blocks separated by a blank line, and each block is handed to
    :func:`parse_block`. Blocks that cannot be parsed are silently skipped.

    Parameters
    ----------
    source: :class:`str` or :class:`~pathlib.Path`
        Where the DBLP turtle content is. Can be on a local file or on the Internet.
    chunk_size: :class:`int`, optional
        Desired chunk size. Documented as needing to be a multiple of 32kB;
        this is not checked here.
    encoding: :class:`str`, default=unicode_escape
        Encoding of stream. Blocks are delimited on the raw bytes ``b"\\n\\n"``
        before decoding, so the encoding is assumed to be ASCII-compatible.

    Yields
    ------
    key: :class:`str`
        DBLP key.
    title: :class:`str`
        Publication title.
    type: :class:`str`
        Type of publication.
    authors: :class:`dict`
        Publication authors (key -> name).
    url: :class:`str` or :class:`NoneType`
        Publication URL.
    stream: :class:`list` or :class:`NoneType`
        Publication streams (normalized journal/conf).
    pages: :class:`str` or :class:`NoneType`
        Publication pages.
    venue: :class:`str` or :class:`NoneType`
        Publication venue (conf/journal).
    year: :class:`int`
        Year of publication.
    """
    separator = b"\n\n"
    with get_stream(source, chunk_size=chunk_size) as (stream, total):
        with tqdm(
            total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing"
        ) as pbar:
            # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
            decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
            # Undecoded tail of the last, possibly incomplete, block.
            # Buffering bytes (not text) guarantees that an escape sequence or
            # a multi-byte character cut by a chunk boundary is only decoded
            # once it is complete, instead of being replaced by U+FFFD.
            pending = b""
            for chunk in stream:
                if not chunk:
                    continue

                # Progress is measured in compressed bytes read from source.
                pbar.update(len(chunk))
                data = decomp.decompress(chunk)
                if not data:
                    continue

                # Everything but the last piece is a complete block.
                *complete, pending = (pending + data).split(separator)
                yield from _decode_and_parse(complete, encoding)

            # Collect whatever the decompressor still holds, then parse the
            # remaining block(s), which have no trailing separator.
            pending += decomp.flush()
            if pending:
                yield from _decode_and_parse(pending.split(separator), encoding)
gismap/sources/hal.py CHANGED
@@ -121,8 +121,8 @@ class HAL(DB):
121
121
  >>> diego = publications[2].authors[3]
122
122
  >>> diego
123
123
  HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
124
- >>> len(diego.get_publications())
125
- 28
124
+ >>> len(diego.get_publications()) > 28
125
+ True
126
126
  >>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
127
127
  HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
128
128
  authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
@@ -180,6 +180,23 @@ class HAL(DB):
180
180
 
181
181
  @dataclass(repr=False)
182
182
  class HALAuthor(Author, HAL):
183
+ """
184
+ Author from the HAL (Hyper Articles en Ligne) database.
185
+
186
+ HAL is a French open archive for scholarly publications.
187
+
188
+ Parameters
189
+ ----------
190
+ name: :class:`str`
191
+ The author's name.
192
+ key: :class:`str` or :class:`int`, optional
193
+ HAL identifier for the author.
194
+ key_type: :class:`str`, optional
195
+ Type of key ('pid', 'fullname', or None for idHal).
196
+ aliases: :class:`list`
197
+ Alternative names for the author.
198
+ """
199
+
183
200
  key: str | int = None
184
201
  key_type: str = None
185
202
  aliases: list = field(default_factory=list)
@@ -277,6 +294,27 @@ HAL_KEYS = {
277
294
 
278
295
  @dataclass(repr=False)
279
296
  class HALPublication(Publication, HAL):
297
+ """
298
+ Publication from the HAL database.
299
+
300
+ Parameters
301
+ ----------
302
+ title: :class:`str`
303
+ Publication title.
304
+ authors: :class:`list`
305
+ List of :class:`HALAuthor` objects.
306
+ venue: :class:`str`
307
+ Publication venue.
308
+ type: :class:`str`
309
+ Publication type.
310
+ year: :class:`int`
311
+ Publication year.
312
+ key: :class:`str`
313
+ HAL document identifier.
314
+ metadata: :class:`dict`
315
+ Additional metadata (abstract, URL, etc.).
316
+ """
317
+
280
318
  key: str
281
319
  metadata: dict = field(default_factory=dict)
282
320