arkindex-client 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

arkindex/client/client.py CHANGED
@@ -2,6 +2,7 @@
 """
 Arkindex API Client
 """
+import json
 import logging
 import os
 import warnings
@@ -11,7 +12,6 @@ from urllib.parse import quote, urljoin, urlparse, urlsplit
 
 import requests
 import typesystem
-import yaml
 from tenacity import (
     before_sleep_log,
     retry,
@@ -30,15 +30,6 @@ logger = logging.getLogger(__name__)
 
 REQUEST_TIMEOUT = (30, 60)
 
-try:
-    from yaml import CSafeLoader as SafeLoader
-
-    logger.debug("Using LibYAML-based parser")
-except ImportError:
-    from yaml import SafeLoader
-
-    logger.debug("Using default PyYAML parser")
-
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 DEFAULT_BASE_URL = "https://arkindex.teklia.com/"
@@ -140,11 +131,17 @@ class ArkindexClient:
             if split.scheme == "file" or not (split.scheme or split.netloc):
                 # This is a local path
                 with open(schema_url) as f:
-                    schema = yaml.load(f, Loader=SafeLoader)
+                    schema = json.load(f)
             else:
-                resp = self.session.get(schema_url)
+                resp = self.session.get(
+                    schema_url,
+                    headers={
+                        # Explicitly request an OpenAPI schema in JSON and not YAML
+                        "Accept": "application/vnd.oai.openapi+json, application/json",
+                    },
+                )
                 resp.raise_for_status()
-                schema = yaml.load(resp.content, Loader=SafeLoader)
+                schema = resp.json()
         except Exception as e:
             raise SchemaError(
                 f"Could not retrieve a proper OpenAPI schema from {schema_url}"
arkindex/client/decoders.py CHANGED
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
-import cgi
 import json
 import os
 import posixpath
 import shutil
 import tempfile
+from email.message import EmailMessage
 from urllib.parse import urlparse
 
 from arkindex.compat import DownloadedFile
@@ -213,10 +213,11 @@ def _get_filename_from_content_disposition(content_disposition):
     """
     Determine an output filename based on the `Content-Disposition` header.
     """
-    value, params = cgi.parse_header(content_disposition)
+    message = EmailMessage()
+    message["content-disposition"] = content_disposition
+    filename = message["content-disposition"].params.get("filename")
 
-    if "filename" in params:
-        filename = params["filename"]
+    if filename:
        return _safe_filename(filename)
 
     return None
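
`cgi.parse_header` goes away because the `cgi` module is deprecated (and removed in Python 3.13); the header is now parsed through `email.message.EmailMessage`, whose parsed headers expose their parameters directly. A minimal sketch of that technique in isolation, with a hypothetical helper name:

```python
from email.message import EmailMessage


def filename_from_content_disposition(content_disposition):
    """Return the filename parameter of a Content-Disposition header, or None."""
    message = EmailMessage()
    message["content-disposition"] = content_disposition
    # The parsed header exposes its parameters as a mapping
    return message["content-disposition"].params.get("filename")


print(filename_from_content_disposition('attachment; filename="report.pdf"'))
# report.pdf
```
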
arkindex/pagination.py CHANGED
@@ -51,9 +51,6 @@ class ResponsePaginator(Sized, Iterator):
         self.request_kwargs = request_kwargs
         """Keyword arguments to send to :meth:`arkindex.ArkindexClient.request` with each request."""
 
-        self.mode = None
-        """`page` for PageNumberPagination endpoints or `cursor` for CursorPagination endpoints."""
-
         self.count = None
         """Total results count."""
 
@@ -69,8 +66,26 @@ class ResponsePaginator(Sized, Iterator):
         ), "retries must be a positive integer"
         """Max number of retries per API request"""
 
-        # First page key is an empty string as we do not know yet the pagination type (e.g. page, cursor)
-        self.initial_page = ""
+        # Detect and store the pagination mode
+        self.mode = None
+        if any(
+            field.name == "cursor"
+            for field in self.client.lookup_operation(self.operation_id).fields
+        ):
+            self.mode = PaginationMode.Cursor
+        elif any(
+            field.name == "page"
+            for field in self.client.lookup_operation(self.operation_id).fields
+        ):
+            self.mode = PaginationMode.PageNumber
+        if not self.mode:
+            raise NotImplementedError(
+                "Pagination only implements page and cursor modes."
+            )
+
+        # First page key is an empty string by default (to stay coherent with page or cursor modes)
+        self.initial_page = request_kwargs.get(self.mode.value, "")
+
         # Store retrieved pages remaining retries
         self.pages = {self.initial_page: self.retries}
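
The pagination mode is now resolved up front from the operation's query parameters instead of being inferred from the first response. A small sketch of the same idea outside the class, with a hypothetical `field_names` list standing in for the operation object returned by `lookup_operation`:

```python
from enum import Enum


class PaginationMode(Enum):
    # The enum values match the query parameter names used by the paginator
    PageNumber = "page"
    Cursor = "cursor"


def detect_mode(field_names):
    """Choose the pagination mode from an operation's query parameter names."""
    if "cursor" in field_names:
        return PaginationMode.Cursor
    if "page" in field_names:
        return PaginationMode.PageNumber
    raise NotImplementedError("Pagination only implements page and cursor modes.")


# An endpoint exposing a ?cursor= parameter is treated as cursor-paginated
assert detect_mode(["corpus", "cursor", "with_count"]) is PaginationMode.Cursor
```
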
 
@@ -90,9 +105,8 @@ class ResponsePaginator(Sized, Iterator):
         Returns False in case the page returned an empty result
         Raises a StopIteration in case there are no pages left to iterate on
         """
-        # Filter out pages with no retries
         # Transform as a list of tuples for simpler output
-        remaining = sorted([(m, v) for m, v in self.pages.items() if v > 0])
+        remaining = [(m, v) for m, v in self.pages.items()]
 
         # No remaining pages, end of iteration
         if not remaining:
@@ -101,28 +115,53 @@ class ResponsePaginator(Sized, Iterator):
         # Get next page to load
         index, retry = remaining[0]
 
-        if self.mode:
+        if index:
             self.request_kwargs[self.mode.value] = index
 
         try:
             extra_kwargs = {}
             if not self.pages_loaded:
-                logger.info(
-                    f"Loading first page on try {self.retries - retry + 1}/{self.retries}"
-                )
+                if (
+                    self.mode == PaginationMode.PageNumber
+                    and self.initial_page
+                    and int(self.initial_page) > 1
+                ) or (self.mode == PaginationMode.Cursor and self.initial_page):
+                    logger.info(
+                        f"Loading page {self.initial_page} on try {self.retries - retry + 1}/{self.retries}"
+                    )
+                else:
+                    logger.info(
+                        f"Loading first page on try {self.retries - retry + 1}/{self.retries}"
+                    )
                 operation_fields = [
                     f.name
                     for f in self.client.lookup_operation(self.operation_id).fields
                 ]
-                # Ask to count results if the operation handle it as we do not know the pagination mode yet
+                # Ask to count results if the operation handle it (this is usually the case with cursors)
                 if "with_count" in operation_fields:
-                    extra_kwargs["with_count"] = "true"
+                    extra_kwargs = {
+                        "with_count": "true",
+                        **extra_kwargs,
+                    }
             else:
-                remaining_count = self.pages_count - self.pages_loaded
-                logger.info(
-                    f"Loading {self.mode.value} {index} on try {self.retries - retry + 1}/{self.retries}"
-                    f" - remains {remaining_count} page{'s' if remaining_count > 1 else ''} to load."
-                )
+                message = f"Loading {self.mode.value} {index} on try {self.retries - retry + 1}/{self.retries}"
+                if self.pages_count is not None:
+                    if self.mode is PaginationMode.Cursor and self.initial_page:
+                        # The number of remaining pages is unknown when an initial cursor is set
+                        max_pages = self.pages_count - self.pages_loaded
+                        message = message + (
+                            f" - remains a maximum of {max_pages} page{'s' if max_pages > 1 else ''} to load."
+                        )
+                    else:
+                        initial = int(self.initial_page) if self.initial_page else 1
+                        remaining_count = (
+                            self.pages_count - self.pages_loaded - (initial - 1)
+                        )
+                        message = message + (
+                            f" - remains {remaining_count} page{'s' if remaining_count > 1 else ''} to load."
+                        )
+
+                logger.info(message)
 
             # Fetch the next page
             self.data = self.client.single_request(
@@ -133,33 +172,32 @@ class ResponsePaginator(Sized, Iterator):
             )
             self.results = self.data.get("results", [])
 
-            if not self.mode and self.data:
-                # Autodetect if this endpoint uses page or cursor pagination
-                if self.data.get("number"):
-                    self.mode = PaginationMode.PageNumber
-                else:
-                    self.mode = PaginationMode.Cursor
-
+            # Retrieve information on the first page with results count
             if self.count is None and "count" in self.data:
-                # Retrieve information on first page with results count
                 self.count = self.data["count"]
                 if self.count == 0:
                     # Pagination has retrieved 0 results
                     self.pages = {}
                     return False
                 self.pages_count = math.ceil(self.count / len(self.results))
-                logger.info(
-                    f"Pagination will load a total of {self.pages_count} page{'s' if self.pages_count > 1 else ''}."
-                )
-                if self.mode == PaginationMode.PageNumber:
+                if self.mode == PaginationMode.Cursor:
+                    logger.info(
+                        f"Pagination will load a {'maximum' if self.initial_page else 'total'} "
+                        f"of {self.pages_count} page{'s' if self.pages_count > 1 else ''}"
+                    )
+                elif self.mode == PaginationMode.PageNumber:
+                    initial = int(self.initial_page) if self.initial_page else 1
+                    total = self.pages_count - initial + 1
+                    logger.info(
+                        f"Pagination will load a total of {total} page{'s' if total > 1 else ''}."
+                    )
                     # Initialize all pages once
-                    self.pages = {
-                        i: self.retries for i in range(2, self.pages_count + 1)
-                    }
-            elif self.mode == PaginationMode.PageNumber:
-                # Mark page as loaded on other pages
-                del self.pages[index]
-
+                    self.pages.update(
+                        {
+                            i: self.retries
+                            for i in range(initial + 1, self.pages_count + 1)
+                        }
+                    )
             if self.mode == PaginationMode.Cursor:
                 # Parse next URL to retrieve the cursor of the next page
                 query = urlsplit(self.data["next"]).query
@@ -168,6 +206,9 @@ class ResponsePaginator(Sized, Iterator):
                     self.pages = {}
                 else:
                     self.pages = {cursor_query[0]: self.retries}
+            elif self.mode == PaginationMode.PageNumber:
+                # Mark the current page as loaded
+                del self.pages[index]
 
             # Stop happy path here, we don't need to process errors
             self.pages_loaded += 1
@@ -201,6 +242,7 @@ class ResponsePaginator(Sized, Iterator):
             logger.warning(error_text)
             if self.allow_missing_data:
                 self.missing.add(index)
+                del self.pages[index]
             else:
                 raise Exception("Stopping pagination as data will be incomplete")
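
The remaining-page figures logged above come from simple arithmetic on the computed page count, the pages already fetched, and the (1-based) page the iteration started on. A worked example of that calculation, with made-up numbers rather than a live API response:

```python
import math


def remaining_pages(count, page_size, pages_loaded, initial_page=""):
    """Pages left to fetch for page-number pagination, as computed above."""
    pages_count = math.ceil(count / page_size)
    initial = int(initial_page) if initial_page else 1
    return pages_count - pages_loaded - (initial - 1)


# 95 results at 10 per page -> 10 pages; starting on page 3 with one page
# already loaded leaves pages 4 through 10, i.e. 7 more pages.
assert remaining_pages(95, 10, pages_loaded=1, initial_page="3") == 7
```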
 
arkindex/schema/validator.py CHANGED
@@ -1,54 +1,22 @@
 # -*- coding: utf-8 -*-
-import re
 import typing
 
 import typesystem
 
 from arkindex.schema.openapi import OPEN_API, OpenAPI
 
-ENCODING_CHOICES = ["json", "yaml", None]
 
-# The regexs give us a best-guess for the encoding if none is specified.
-# They check to see if the document looks like it is probably a YAML object or
-# probably a JSON object. It'll typically be best to specify the encoding
-# explicitly, but this should do for convenience.
-INFER_YAML = re.compile(r"^([ \t]*#.*\n|---[ \t]*\n)*\s*[A-Za-z0-9_-]+[ \t]*:")
-INFER_JSON = re.compile(r'^\s*{\s*"[A-Za-z0-9_-]+"\s*:')
-
-
-def validate(schema: typing.Union[dict, str, bytes], encoding: str = None):
+def validate(schema: typing.Union[dict, str, bytes]):
     if not isinstance(schema, (dict, str, bytes)):
         raise ValueError("schema must be either str, bytes, or dict.")
-    if encoding not in ENCODING_CHOICES:
-        raise ValueError(f"encoding must be one of {ENCODING_CHOICES!r}")
 
     if isinstance(schema, bytes):
         schema = schema.decode("utf8", "ignore")
 
     if isinstance(schema, str):
-        if encoding is None:
-            if INFER_YAML.match(schema):
-                encoding = "yaml"
-            elif INFER_JSON.match(schema):
-                encoding = "json"
-            else:
-                text = "Could not determine if content is JSON or YAML."
-                code = "unknown_encoding"
-                position = typesystem.Position(line_no=1, column_no=1, char_index=0)
-                raise typesystem.ParseError(text=text, code=code, position=position)
-
-        tokenize = {"yaml": typesystem.tokenize_yaml, "json": typesystem.tokenize_json}[
-            encoding
-        ]
-        token = tokenize(schema)
-        value = token.value
-    else:
-        token = None
-        value = schema
-
-    if token is not None:
+        token = typesystem.tokenize_json(schema)
         value = typesystem.validate_with_positions(token=token, validator=OpenAPI)
     else:
-        value = OPEN_API.validate(value)
+        value = OPEN_API.validate(schema)
 
     return OpenAPI().load(value)
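
With the encoding inference gone, `validate` accepts either a dict or a JSON string (or bytes); YAML text is no longer recognised. A hedged usage sketch, where the minimal document shown is hypothetical and the exact required fields depend on the `OpenAPI` validator defined in arkindex/schema/openapi.py:

```python
import json

import typesystem

from arkindex.schema.validator import validate

# Hypothetical minimal OpenAPI 3 document for illustration only
doc = {
    "openapi": "3.0.0",
    "info": {"title": "Example", "version": "1.0.0"},
    "paths": {},
}

# Dicts are validated directly; strings and bytes are always parsed as JSON now
validate(doc)
validate(json.dumps(doc))

# YAML input is rejected at the tokenizing step instead of being parsed
try:
    validate("openapi: 3.0.0\n")
except typesystem.ParseError as exc:
    print("Not JSON:", exc)
```
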
arkindex_client-1.1.2.dist-info/METADATA → arkindex_client-1.1.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: arkindex-client
-Version: 1.1.2
+Version: 1.1.3
 Summary: API client for the Arkindex project
 Home-page: https://gitlab.teklia.com/arkindex/api-client
 Author: Teklia <contact@teklia.com>
@@ -20,7 +20,6 @@ Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.8
 License-File: LICENSE
-Requires-Dist: PyYAML==6.0.2
 Requires-Dist: requests~=2.28
 Requires-Dist: tenacity==8.2.3
 Requires-Dist: typesystem==0.4.1
arkindex_client-1.1.2.dist-info/RECORD → arkindex_client-1.1.3.dist-info/RECORD RENAMED
@@ -6,15 +6,15 @@ arkindex/compat.py,sha256=Kjxu--QoF8sBxKOvXMtNcDQ0XK7MLc_2C8Q2knll4Lk,805
 arkindex/document.py,sha256=YyqSm3Kc35j3iWuJujyfrLfMy-gNydBtmcR45pUtfC4,3732
 arkindex/exceptions.py,sha256=hDxbgC7uAD8wbTQS1DaEJZ25Nun41Io8Y0BiwrZ1ZSM,2016
 arkindex/mock.py,sha256=olYBFCkLQuuf9gGu7wlmZiLFMQknGGi8evS531RjjUE,2755
-arkindex/pagination.py,sha256=c6dG_OkQDG00ZfGUbHuZxu-UvOpmYf7dJP1ZaUaha1Y,9008
+arkindex/pagination.py,sha256=kzOyl2oMqGyyVy7LG7eKK9wAI6YdVzEeOuBEbkdw5Zo,11002
 arkindex/client/__init__.py,sha256=g_G_bSfMbduYzpi9iURTn0cYLV4nMulDR8rD7x-DLyc,142
-arkindex/client/client.py,sha256=UEkOYXg9HUdKd_20VNpNpXRLWvwTeKpbu-IkV6Xfv2I,15416
-arkindex/client/decoders.py,sha256=F_uBGOrh1BFnZzuW0MTjwm8wAArbybPsrIVol_2vYN0,7886
+arkindex/client/client.py,sha256=ZOcMY5IjOUxADkCsCdpgylJM-oJVY_XObRVH4p4onVI,15428
+arkindex/client/decoders.py,sha256=WmRdqNAFv884XIfHXTkQoohJMapeTq8AqQzsW26K-t4,7952
 arkindex/schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arkindex/schema/openapi.py,sha256=HHAHyUxqa6sK1l8aEy7SHx-9w20Pbov54AB4rHPjguk,9183
-arkindex/schema/validator.py,sha256=N2sda7vxfivw68VuyX-MfmUlrXjf_LEHNg6OCKc1mjQ,1926
-arkindex_client-1.1.2.dist-info/LICENSE,sha256=s7yDHdG8liSj2PiaVwRi9G5wR1qDXSPmhPJPgWbrkWU,34504
-arkindex_client-1.1.2.dist-info/METADATA,sha256=DsYRcnTdzzByCuCFxpIPxL1pagT_pN0rPzztb0Cb0y8,1080
-arkindex_client-1.1.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-arkindex_client-1.1.2.dist-info/top_level.txt,sha256=ALyF0lTPpxOheUGmSVwEhgI6eMYwm_9Eu37G-RwGBRM,17
-arkindex_client-1.1.2.dist-info/RECORD,,
+arkindex/schema/validator.py,sha256=Baq2TtqMWZVRU_SYF7aUJ0Y80t-CIboCtK_GV8TPNKE,625
+arkindex_client-1.1.3.dist-info/LICENSE,sha256=s7yDHdG8liSj2PiaVwRi9G5wR1qDXSPmhPJPgWbrkWU,34504
+arkindex_client-1.1.3.dist-info/METADATA,sha256=rOgOnPbp4NP2WnhQMNwx61OIcVQX2nfz-6XrFeO_nQ8,1051
+arkindex_client-1.1.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+arkindex_client-1.1.3.dist-info/top_level.txt,sha256=ALyF0lTPpxOheUGmSVwEhgI6eMYwm_9Eu37G-RwGBRM,17
+arkindex_client-1.1.3.dist-info/RECORD,,
arkindex_client-1.1.2.dist-info/WHEEL → arkindex_client-1.1.3.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.44.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 