arkindex-client 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

arkindex/client/client.py CHANGED
@@ -2,6 +2,7 @@
 """
 Arkindex API Client
 """
+import json
 import logging
 import os
 import warnings
@@ -11,7 +12,6 @@ from urllib.parse import quote, urljoin, urlparse, urlsplit
 
 import requests
 import typesystem
-import yaml
 from tenacity import (
     before_sleep_log,
     retry,
@@ -30,15 +30,6 @@ logger = logging.getLogger(__name__)
 
 REQUEST_TIMEOUT = (30, 60)
 
-try:
-    from yaml import CSafeLoader as SafeLoader
-
-    logger.debug("Using LibYAML-based parser")
-except ImportError:
-    from yaml import SafeLoader
-
-    logger.debug("Using default PyYAML parser")
-
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 DEFAULT_BASE_URL = "https://arkindex.teklia.com/"
@@ -140,11 +131,17 @@ class ArkindexClient:
             if split.scheme == "file" or not (split.scheme or split.netloc):
                 # This is a local path
                 with open(schema_url) as f:
-                    schema = yaml.load(f, Loader=SafeLoader)
+                    schema = json.load(f)
             else:
-                resp = self.session.get(schema_url)
+                resp = self.session.get(
+                    schema_url,
+                    headers={
+                        # Explicitly request an OpenAPI schema in JSON and not YAML
+                        "Accept": "application/vnd.oai.openapi+json, application/json",
+                    },
+                )
                 resp.raise_for_status()
-                schema = yaml.load(resp.content, Loader=SafeLoader)
+                schema = resp.json()
         except Exception as e:
             raise SchemaError(
                 f"Could not retrieve a proper OpenAPI schema from {schema_url}"
arkindex/client/decoders.py CHANGED
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
-import cgi
 import json
 import os
 import posixpath
 import shutil
 import tempfile
+from email.message import EmailMessage
 from urllib.parse import urlparse
 
 from arkindex.compat import DownloadedFile
@@ -213,10 +213,11 @@ def _get_filename_from_content_disposition(content_disposition):
     """
     Determine an output filename based on the `Content-Disposition` header.
     """
-    value, params = cgi.parse_header(content_disposition)
+    message = EmailMessage()
+    message["content-disposition"] = content_disposition
+    filename = message["content-disposition"].params.get("filename")
 
-    if "filename" in params:
-        filename = params["filename"]
+    if filename:
        return _safe_filename(filename)
 
     return None
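
`cgi.parse_header` goes away because the `cgi` module is deprecated (and removed in Python 3.13); the header is now parsed through `email.message.EmailMessage`, whose parsed headers expose their parameters directly. A minimal sketch of that technique in isolation, with a hypothetical helper name:

```python
from email.message import EmailMessage


def filename_from_content_disposition(content_disposition):
    """Return the filename parameter of a Content-Disposition header, or None."""
    message = EmailMessage()
    message["content-disposition"] = content_disposition
    # The parsed header exposes its parameters as a mapping
    return message["content-disposition"].params.get("filename")


print(filename_from_content_disposition('attachment; filename="report.pdf"'))
# report.pdf
```
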
arkindex/pagination.py CHANGED
@@ -51,9 +51,6 @@ class ResponsePaginator(Sized, Iterator):
         self.request_kwargs = request_kwargs
         """Keyword arguments to send to :meth:`arkindex.ArkindexClient.request` with each request."""
 
-        self.mode = None
-        """`page` for PageNumberPagination endpoints or `cursor` for CursorPagination endpoints."""
-
         self.count = None
         """Total results count."""
 
@@ -69,8 +66,26 @@ class ResponsePaginator(Sized, Iterator):
         ), "retries must be a positive integer"
         """Max number of retries per API request"""
 
-        # First page key is an empty string as we do not know yet the pagination type (e.g. page, cursor)
-        self.initial_page = ""
+        # Detect and store the pagination mode
+        self.mode = None
+        if any(
+            field.name == "cursor"
+            for field in self.client.lookup_operation(self.operation_id).fields
+        ):
+            self.mode = PaginationMode.Cursor
+        elif any(
+            field.name == "page"
+            for field in self.client.lookup_operation(self.operation_id).fields
+        ):
+            self.mode = PaginationMode.PageNumber
+        if not self.mode:
+            raise NotImplementedError(
+                "Pagination only implements page and cursor modes."
+            )
+
+        # First page key is an empty string by default (to stay coherent with page or cursor modes)
+        self.initial_page = request_kwargs.get(self.mode.value, "")
+
         # Store retrieved pages remaining retries
         self.pages = {self.initial_page: self.retries}
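
The pagination mode is now resolved up front from the operation's query parameters instead of being inferred from the first response. A small sketch of the same idea outside the class, with a hypothetical `field_names` list standing in for the operation object returned by `lookup_operation`:

```python
from enum import Enum


class PaginationMode(Enum):
    # The enum values match the query parameter names used by the paginator
    PageNumber = "page"
    Cursor = "cursor"


def detect_mode(field_names):
    """Choose the pagination mode from an operation's query parameter names."""
    if "cursor" in field_names:
        return PaginationMode.Cursor
    if "page" in field_names:
        return PaginationMode.PageNumber
    raise NotImplementedError("Pagination only implements page and cursor modes.")


# An endpoint exposing a ?cursor= parameter is treated as cursor-paginated
assert detect_mode(["corpus", "cursor", "with_count"]) is PaginationMode.Cursor
```
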
 
@@ -90,9 +105,8 @@ class ResponsePaginator(Sized, Iterator):
         Returns False in case the page returned an empty result
         Raises a StopIteration in case there are no pages left to iterate on
         """
-        # Filter out pages with no retries
         # Transform as a list of tuples for simpler output
-        remaining = sorted([(m, v) for m, v in self.pages.items() if v > 0])
+        remaining = [(m, v) for m, v in self.pages.items()]
 
         # No remaining pages, end of iteration
         if not remaining:
@@ -101,28 +115,53 @@ class ResponsePaginator(Sized, Iterator):
         # Get next page to load
         index, retry = remaining[0]
 
-        if self.mode:
+        if index:
             self.request_kwargs[self.mode.value] = index
 
         try:
             extra_kwargs = {}
             if not self.pages_loaded:
-                logger.info(
-                    f"Loading first page on try {self.retries - retry + 1}/{self.retries}"
-                )
+                if (
+                    self.mode == PaginationMode.PageNumber
+                    and self.initial_page
+                    and int(self.initial_page) > 1
+                ) or (self.mode == PaginationMode.Cursor and self.initial_page):
+                    logger.info(
+                        f"Loading page {self.initial_page} on try {self.retries - retry + 1}/{self.retries}"
+                    )
+                else:
+                    logger.info(
+                        f"Loading first page on try {self.retries - retry + 1}/{self.retries}"
+                    )
                 operation_fields = [
                     f.name
                     for f in self.client.lookup_operation(self.operation_id).fields
                 ]
-                # Ask to count results if the operation handle it as we do not know the pagination mode yet
+                # Ask to count results if the operation handle it (this is usually the case with cursors)
                 if "with_count" in operation_fields:
-                    extra_kwargs["with_count"] = "true"
+                    extra_kwargs = {
+                        "with_count": "true",
+                        **extra_kwargs,
+                    }
             else:
-                remaining_count = self.pages_count - self.pages_loaded
-                logger.info(
-                    f"Loading {self.mode.value} {index} on try {self.retries - retry + 1}/{self.retries}"
-                    f" - remains {remaining_count} page{'s' if remaining_count > 1 else ''} to load."
-                )
+                message = f"Loading {self.mode.value} {index} on try {self.retries - retry + 1}/{self.retries}"
+                if self.pages_count is not None:
+                    if self.mode is PaginationMode.Cursor and self.initial_page:
+                        # The number of remaining pages is unknown when an initial cursor is set
+                        max_pages = self.pages_count - self.pages_loaded
+                        message = message + (
+                            f" - remains a maximum of {max_pages} page{'s' if max_pages > 1 else ''} to load."
+                        )
+                    else:
+                        initial = int(self.initial_page) if self.initial_page else 1
+                        remaining_count = (
+                            self.pages_count - self.pages_loaded - (initial - 1)
+                        )
+                        message = message + (
+                            f" - remains {remaining_count} page{'s' if remaining_count > 1 else ''} to load."
+                        )
+
+                logger.info(message)
 
             # Fetch the next page
             self.data = self.client.single_request(
@@ -133,33 +172,32 @@ class ResponsePaginator(Sized, Iterator):
             )
             self.results = self.data.get("results", [])
 
-            if not self.mode and self.data:
-                # Autodetect if this endpoint uses page or cursor pagination
-                if self.data.get("number"):
-                    self.mode = PaginationMode.PageNumber
-                else:
-                    self.mode = PaginationMode.Cursor
-
+            # Retrieve information on the first page with results count
             if self.count is None and "count" in self.data:
-                # Retrieve information on first page with results count
                 self.count = self.data["count"]
                 if self.count == 0:
                     # Pagination has retrieved 0 results
                     self.pages = {}
                     return False
                 self.pages_count = math.ceil(self.count / len(self.results))
-                logger.info(
-                    f"Pagination will load a total of {self.pages_count} page{'s' if self.pages_count > 1 else ''}."
-                )
-                if self.mode == PaginationMode.PageNumber:
+                if self.mode == PaginationMode.Cursor:
+                    logger.info(
+                        f"Pagination will load a {'maximum' if self.initial_page else 'total'} "
+                        f"of {self.pages_count} page{'s' if self.pages_count > 1 else ''}"
+                    )
+                elif self.mode == PaginationMode.PageNumber:
+                    initial = int(self.initial_page) if self.initial_page else 1
+                    total = self.pages_count - initial + 1
+                    logger.info(
+                        f"Pagination will load a total of {total} page{'s' if total > 1 else ''}."
+                    )
                     # Initialize all pages once
-                    self.pages = {
-                        i: self.retries for i in range(2, self.pages_count + 1)
-                    }
-            elif self.mode == PaginationMode.PageNumber:
-                # Mark page as loaded on other pages
-                del self.pages[index]
-
+                    self.pages.update(
+                        {
+                            i: self.retries
+                            for i in range(initial + 1, self.pages_count + 1)
+                        }
+                    )
             if self.mode == PaginationMode.Cursor:
                 # Parse next URL to retrieve the cursor of the next page
                 query = urlsplit(self.data["next"]).query
@@ -168,6 +206,9 @@ class ResponsePaginator(Sized, Iterator):
                     self.pages = {}
                 else:
                     self.pages = {cursor_query[0]: self.retries}
+            elif self.mode == PaginationMode.PageNumber:
+                # Mark the current page as loaded
+                del self.pages[index]
 
             # Stop happy path here, we don't need to process errors
             self.pages_loaded += 1
@@ -201,6 +242,7 @@ class ResponsePaginator(Sized, Iterator):
             logger.warning(error_text)
             if self.allow_missing_data:
                 self.missing.add(index)
+                del self.pages[index]
             else:
                 raise Exception("Stopping pagination as data will be incomplete")
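
The remaining-page figures logged above come from simple arithmetic on the computed page count, the pages already fetched, and the (1-based) page the iteration started on. A worked example of that calculation, with made-up numbers rather than a live API response:

```python
import math


def remaining_pages(count, page_size, pages_loaded, initial_page=""):
    """Pages left to fetch for page-number pagination, as computed above."""
    pages_count = math.ceil(count / page_size)
    initial = int(initial_page) if initial_page else 1
    return pages_count - pages_loaded - (initial - 1)


# 95 results at 10 per page -> 10 pages; starting on page 3 with one page
# already loaded leaves pages 4 through 10, i.e. 7 more pages.
assert remaining_pages(95, 10, pages_loaded=1, initial_page="3") == 7
```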
 
arkindex/schema/validator.py CHANGED
@@ -1,54 +1,22 @@
 # -*- coding: utf-8 -*-
-import re
 import typing
 
 import typesystem
 
 from arkindex.schema.openapi import OPEN_API, OpenAPI
 
-ENCODING_CHOICES = ["json", "yaml", None]
 
-# The regexs give us a best-guess for the encoding if none is specified.
-# They check to see if the document looks like it is probably a YAML object or
-# probably a JSON object. It'll typically be best to specify the encoding
-# explicitly, but this should do for convenience.
-INFER_YAML = re.compile(r"^([ \t]*#.*\n|---[ \t]*\n)*\s*[A-Za-z0-9_-]+[ \t]*:")
-INFER_JSON = re.compile(r'^\s*{\s*"[A-Za-z0-9_-]+"\s*:')
-
-
-def validate(schema: typing.Union[dict, str, bytes], encoding: str = None):
+def validate(schema: typing.Union[dict, str, bytes]):
     if not isinstance(schema, (dict, str, bytes)):
         raise ValueError("schema must be either str, bytes, or dict.")
-    if encoding not in ENCODING_CHOICES:
-        raise ValueError(f"encoding must be one of {ENCODING_CHOICES!r}")
 
     if isinstance(schema, bytes):
         schema = schema.decode("utf8", "ignore")
 
     if isinstance(schema, str):
-        if encoding is None:
-            if INFER_YAML.match(schema):
-                encoding = "yaml"
-            elif INFER_JSON.match(schema):
-                encoding = "json"
-            else:
-                text = "Could not determine if content is JSON or YAML."
-                code = "unknown_encoding"
-                position = typesystem.Position(line_no=1, column_no=1, char_index=0)
-                raise typesystem.ParseError(text=text, code=code, position=position)
-
-        tokenize = {"yaml": typesystem.tokenize_yaml, "json": typesystem.tokenize_json}[
-            encoding
-        ]
-        token = tokenize(schema)
-        value = token.value
-    else:
-        token = None
-        value = schema
-
-    if token is not None:
+        token = typesystem.tokenize_json(schema)
         value = typesystem.validate_with_positions(token=token, validator=OpenAPI)
     else:
-        value = OPEN_API.validate(value)
+        value = OPEN_API.validate(schema)
 
     return OpenAPI().load(value)
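
With the encoding inference gone, `validate` accepts either a dict or a JSON string (or bytes); YAML text is no longer recognised. A hedged usage sketch, where the minimal document shown is hypothetical and the exact required fields depend on the `OpenAPI` validator defined in arkindex/schema/openapi.py:

```python
import json

import typesystem

from arkindex.schema.validator import validate

# Hypothetical minimal OpenAPI 3 document for illustration only
doc = {
    "openapi": "3.0.0",
    "info": {"title": "Example", "version": "1.0.0"},
    "paths": {},
}

# Dicts are validated directly; strings and bytes are always parsed as JSON now
validate(doc)
validate(json.dumps(doc))

# YAML input is rejected at the tokenizing step instead of being parsed
try:
    validate("openapi: 3.0.0\n")
except typesystem.ParseError as exc:
    print("Not JSON:", exc)
```
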
arkindex_client-1.1.2.dist-info/METADATA → arkindex_client-1.1.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-client
-Version: 1.1.2
+Version: 1.1.3
 Summary: API client for the Arkindex project
 Home-page: https://gitlab.teklia.com/arkindex/api-client
 Author: Teklia <contact@teklia.com>
@@ -20,7 +20,6 @@ Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.8
 License-File: LICENSE
-Requires-Dist: PyYAML==6.0.2
 Requires-Dist: requests~=2.28
 Requires-Dist: tenacity==8.2.3
 Requires-Dist: typesystem==0.4.1
arkindex_client-1.1.2.dist-info/RECORD → arkindex_client-1.1.3.dist-info/RECORD RENAMED
@@ -6,15 +6,15 @@ arkindex/compat.py,sha256=Kjxu--QoF8sBxKOvXMtNcDQ0XK7MLc_2C8Q2knll4Lk,805
 arkindex/document.py,sha256=YyqSm3Kc35j3iWuJujyfrLfMy-gNydBtmcR45pUtfC4,3732
 arkindex/exceptions.py,sha256=hDxbgC7uAD8wbTQS1DaEJZ25Nun41Io8Y0BiwrZ1ZSM,2016
 arkindex/mock.py,sha256=olYBFCkLQuuf9gGu7wlmZiLFMQknGGi8evS531RjjUE,2755
-arkindex/pagination.py,sha256=c6dG_OkQDG00ZfGUbHuZxu-UvOpmYf7dJP1ZaUaha1Y,9008
+arkindex/pagination.py,sha256=kzOyl2oMqGyyVy7LG7eKK9wAI6YdVzEeOuBEbkdw5Zo,11002
 arkindex/client/__init__.py,sha256=g_G_bSfMbduYzpi9iURTn0cYLV4nMulDR8rD7x-DLyc,142
-arkindex/client/client.py,sha256=UEkOYXg9HUdKd_20VNpNpXRLWvwTeKpbu-IkV6Xfv2I,15416
-arkindex/client/decoders.py,sha256=F_uBGOrh1BFnZzuW0MTjwm8wAArbybPsrIVol_2vYN0,7886
+arkindex/client/client.py,sha256=ZOcMY5IjOUxADkCsCdpgylJM-oJVY_XObRVH4p4onVI,15428
+arkindex/client/decoders.py,sha256=WmRdqNAFv884XIfHXTkQoohJMapeTq8AqQzsW26K-t4,7952
 arkindex/schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arkindex/schema/openapi.py,sha256=HHAHyUxqa6sK1l8aEy7SHx-9w20Pbov54AB4rHPjguk,9183
-arkindex/schema/validator.py,sha256=N2sda7vxfivw68VuyX-MfmUlrXjf_LEHNg6OCKc1mjQ,1926
-arkindex_client-1.1.2.dist-info/LICENSE,sha256=s7yDHdG8liSj2PiaVwRi9G5wR1qDXSPmhPJPgWbrkWU,34504
-arkindex_client-1.1.2.dist-info/METADATA,sha256=DsYRcnTdzzByCuCFxpIPxL1pagT_pN0rPzztb0Cb0y8,1080
-arkindex_client-1.1.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-arkindex_client-1.1.2.dist-info/top_level.txt,sha256=ALyF0lTPpxOheUGmSVwEhgI6eMYwm_9Eu37G-RwGBRM,17
-arkindex_client-1.1.2.dist-info/RECORD,,
+arkindex/schema/validator.py,sha256=Baq2TtqMWZVRU_SYF7aUJ0Y80t-CIboCtK_GV8TPNKE,625
+arkindex_client-1.1.3.dist-info/LICENSE,sha256=s7yDHdG8liSj2PiaVwRi9G5wR1qDXSPmhPJPgWbrkWU,34504
+arkindex_client-1.1.3.dist-info/METADATA,sha256=rOgOnPbp4NP2WnhQMNwx61OIcVQX2nfz-6XrFeO_nQ8,1051
+arkindex_client-1.1.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+arkindex_client-1.1.3.dist-info/top_level.txt,sha256=ALyF0lTPpxOheUGmSVwEhgI6eMYwm_9Eu37G-RwGBRM,17
+arkindex_client-1.1.3.dist-info/RECORD,,
arkindex_client-1.1.2.dist-info/WHEEL → arkindex_client-1.1.3.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.44.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 