gismap 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -8,24 +8,39 @@ from tqdm.auto import tqdm
8
8
  from gismap.utils.requests import session
9
9
  from gismap.sources.dblp import DBLP_TYPES
10
10
 
11
- key_re = r'<https://dblp.org/rec/([^>]+)>'
11
+ key_re = r"<https://dblp.org/rec/([^>]+)>"
12
12
  title_re = r'.*?dblp:title\s+"([^"]+)"'
13
- type_re = r'.*?dblp:bibtexType\s+bibtex:(\w+)'
14
- authors_re = r'.*?dblp:hasSignature\s+(\[.*\])\s*;'
15
- url_re = r'(?:.*?dblp:primaryDocumentPage <([^>]+)>)?'
16
- stream_re = r'(?:.*?dblp:publishedInStream ([^;]+) ;)?'
13
+ type_re = r".*?dblp:bibtexType\s+bibtex:(\w+)"
14
+ authors_re = r".*?dblp:hasSignature\s+(\[.*\])\s*;"
15
+ url_re = r"(?:.*?dblp:primaryDocumentPage <([^>]+)>)?"
16
+ stream_re = r"(?:.*?dblp:publishedInStream ([^;]+) ;)?"
17
17
  pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
18
18
  venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'
19
19
  year_re = r'.*?"(\d{4})"\^\^<http://www.w3.org/2001/XMLSchema#gYear>'
20
20
 
21
- pub_re = re.compile("".join([key_re, title_re, type_re, authors_re,
22
- url_re, stream_re, pages_re, venue_re, year_re]), flags=re.S)
23
-
24
- streams_re = re.compile(r'<https://dblp.org/streams/((?:conf|journals)/[^>]+)>')
21
+ pub_re = re.compile(
22
+ "".join(
23
+ [
24
+ key_re,
25
+ title_re,
26
+ type_re,
27
+ authors_re,
28
+ url_re,
29
+ stream_re,
30
+ pages_re,
31
+ venue_re,
32
+ year_re,
33
+ ]
34
+ ),
35
+ flags=re.S,
36
+ )
37
+
38
+ streams_re = re.compile(r"<https://dblp.org/streams/((?:conf|journals)/[^>]+)>")
25
39
 
26
40
  authid_re = re.compile(
27
41
  r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp.org/pid/([^>]+?)>.*?]',
28
- flags=re.S)
42
+ flags=re.S,
43
+ )
29
44
 
30
45
 
31
46
  def parse_block(dblp_block):
@@ -100,12 +115,14 @@ def get_stream(source, chunk_size=1024 * 64):
100
115
  return None
101
116
  total = source.stat().st_size
102
117
  with source.open("rb") as file_handle:
118
+
103
119
  def read_chunks():
104
120
  while True:
105
121
  chunk = file_handle.read(chunk_size)
106
122
  if not chunk:
107
123
  break
108
124
  yield chunk
125
+
109
126
  yield read_chunks(), total
110
127
 
111
128
 
@@ -136,7 +153,9 @@ def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
136
153
  Year of publication.
137
154
  """
138
155
  with get_stream(source, chunk_size=chunk_size) as (stream, total):
139
- with tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
156
+ with tqdm(
157
+ total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing"
158
+ ) as pbar:
140
159
  decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
141
160
  text_buffer = ""
142
161
  for chunk in stream:
gismap/sources/hal.py CHANGED
@@ -180,6 +180,23 @@ class HAL(DB):
180
180
 
181
181
  @dataclass(repr=False)
182
182
  class HALAuthor(Author, HAL):
183
+ """
184
+ Author from the HAL (Hyper Articles en Ligne) database.
185
+
186
+ HAL is a French open archive for scholarly publications.
187
+
188
+ Parameters
189
+ ----------
190
+ name: :class:`str`
191
+ The author's name.
192
+ key: :class:`str` or :class:`int`, optional
193
+ HAL identifier for the author.
194
+ key_type: :class:`str`, optional
195
+ Type of key ('pid', 'fullname', or None for idHal).
196
+ aliases: :class:`list`
197
+ Alternative names for the author.
198
+ """
199
+
183
200
  key: str | int = None
184
201
  key_type: str = None
185
202
  aliases: list = field(default_factory=list)
@@ -277,6 +294,27 @@ HAL_KEYS = {
277
294
 
278
295
  @dataclass(repr=False)
279
296
  class HALPublication(Publication, HAL):
297
+ """
298
+ Publication from the HAL database.
299
+
300
+ Parameters
301
+ ----------
302
+ title: :class:`str`
303
+ Publication title.
304
+ authors: :class:`list`
305
+ List of :class:`HALAuthor` objects.
306
+ venue: :class:`str`
307
+ Publication venue.
308
+ type: :class:`str`
309
+ Publication type.
310
+ year: :class:`int`
311
+ Publication year.
312
+ key: :class:`str`
313
+ HAL document identifier.
314
+ metadata: :class:`dict`
315
+ Additional metadata (abstract, URL, etc.).
316
+ """
317
+
280
318
  key: str
281
319
  metadata: dict = field(default_factory=dict)
282
320