cognite-extractor-utils 7.5.14__py3-none-any.whl → 7.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cognite-extractor-utils might be problematic.

Files changed (47)
  1. cognite/extractorutils/__init__.py +1 -1
  2. cognite/extractorutils/_inner_util.py +1 -1
  3. cognite/extractorutils/base.py +120 -40
  4. cognite/extractorutils/configtools/__init__.py +4 -5
  5. cognite/extractorutils/configtools/_util.py +3 -2
  6. cognite/extractorutils/configtools/elements.py +206 -33
  7. cognite/extractorutils/configtools/loaders.py +68 -16
  8. cognite/extractorutils/configtools/validators.py +5 -1
  9. cognite/extractorutils/exceptions.py +11 -2
  10. cognite/extractorutils/metrics.py +17 -12
  11. cognite/extractorutils/statestore/__init__.py +77 -3
  12. cognite/extractorutils/statestore/_base.py +7 -3
  13. cognite/extractorutils/statestore/hashing.py +129 -15
  14. cognite/extractorutils/statestore/watermark.py +77 -87
  15. cognite/extractorutils/threading.py +30 -4
  16. cognite/extractorutils/unstable/__init__.py +5 -5
  17. cognite/extractorutils/unstable/configuration/__init__.py +3 -0
  18. cognite/extractorutils/unstable/configuration/exceptions.py +13 -2
  19. cognite/extractorutils/unstable/configuration/loaders.py +78 -13
  20. cognite/extractorutils/unstable/configuration/models.py +121 -7
  21. cognite/extractorutils/unstable/core/__init__.py +5 -0
  22. cognite/extractorutils/unstable/core/_dto.py +5 -3
  23. cognite/extractorutils/unstable/core/base.py +113 -4
  24. cognite/extractorutils/unstable/core/errors.py +41 -0
  25. cognite/extractorutils/unstable/core/logger.py +149 -0
  26. cognite/extractorutils/unstable/core/restart_policy.py +16 -2
  27. cognite/extractorutils/unstable/core/runtime.py +44 -6
  28. cognite/extractorutils/unstable/core/tasks.py +53 -1
  29. cognite/extractorutils/unstable/scheduling/__init__.py +13 -0
  30. cognite/extractorutils/unstable/scheduling/_scheduler.py +1 -1
  31. cognite/extractorutils/uploader/__init__.py +7 -5
  32. cognite/extractorutils/uploader/_base.py +4 -5
  33. cognite/extractorutils/uploader/assets.py +13 -8
  34. cognite/extractorutils/uploader/data_modeling.py +37 -2
  35. cognite/extractorutils/uploader/events.py +14 -9
  36. cognite/extractorutils/uploader/files.py +80 -21
  37. cognite/extractorutils/uploader/raw.py +12 -7
  38. cognite/extractorutils/uploader/time_series.py +58 -49
  39. cognite/extractorutils/uploader/upload_failure_handler.py +35 -2
  40. cognite/extractorutils/uploader_extractor.py +29 -6
  41. cognite/extractorutils/uploader_types.py +15 -1
  42. cognite/extractorutils/util.py +76 -23
  43. {cognite_extractor_utils-7.5.14.dist-info → cognite_extractor_utils-7.6.0.dist-info}/METADATA +1 -1
  44. cognite_extractor_utils-7.6.0.dist-info/RECORD +50 -0
  45. cognite_extractor_utils-7.5.14.dist-info/RECORD +0 -50
  46. {cognite_extractor_utils-7.5.14.dist-info → cognite_extractor_utils-7.6.0.dist-info}/WHEEL +0 -0
  47. {cognite_extractor_utils-7.5.14.dist-info → cognite_extractor_utils-7.6.0.dist-info}/licenses/LICENSE +0 -0
cognite/extractorutils/uploader/time_series.py

@@ -1,3 +1,6 @@
+"""
+Upload queue for time series and sequences.
+"""
 # Copyright 2023 Cognite AS
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -81,7 +84,7 @@ def default_time_series_factory(external_id: str, datapoints: DataPointList) ->
 
 class TimeSeriesUploadQueue(AbstractUploadQueue):
     """
-    Upload queue for time series
+    Upload queue for time series.
 
     Args:
         cdf_client: Cognite Data Fusion client to use
@@ -141,7 +144,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         self.data_set_id = data_set_id
 
     def _verify_datapoint_time(self, time: int | float | datetime | str) -> bool:
-        if isinstance(time, int) or isinstance(time, float):
+        if isinstance(time, int | float):
             return not math.isnan(time) and time >= MIN_DATAPOINT_TIMESTAMP
         elif isinstance(time, str):
             return False
@@ -155,10 +158,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             )
         elif isinstance(value, str):
             return len(value) <= MAX_DATAPOINT_STRING_LENGTH
-        elif isinstance(value, datetime):
-            return False
-        else:
-            return True
+        return not isinstance(value, datetime)
 
     def _is_datapoint_valid(
         self,
@@ -172,11 +172,16 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         return True
 
     def add_to_upload_queue(
-        self, *, id: int | None = None, external_id: str | None = None, datapoints: DataPointList | None = None
+        self,
+        *,
+        id: int | None = None,  # noqa: A002
+        external_id: str | None = None,
+        datapoints: DataPointList | None = None,
     ) -> None:
         """
-        Add data points to upload queue. The queue will be uploaded if the queue size is larger than the threshold
-        specified in the __init__.
+        Add data points to upload queue.
+
+        The queue will be uploaded if the queue size is larger than the threshold specified in the ``__init__``.
 
         Args:
             id: Internal ID of time series. Either this or external_id must be set.
@@ -209,7 +214,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
     def upload(self) -> None:
         """
-        Trigger an upload of the queue, clears queue afterwards
+        Trigger an upload of the queue, clears queue afterwards.
         """
 
     @retry(
@@ -239,9 +244,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
             if self.create_missing:
                 # Get the time series that can be created
-                create_these_ids = set(
-                    [id_dict["externalId"] for id_dict in ex.not_found if "externalId" in id_dict]
-                )
+                create_these_ids = {id_dict["externalId"] for id_dict in ex.not_found if "externalId" in id_dict}
                 datapoints_lists: dict[str, DataPointList] = {
                     ts_dict["externalId"]: ts_dict["datapoints"]
                     for ts_dict in upload_this
@@ -294,7 +297,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             ]
         )
 
-        for _either_id, datapoints in self.upload_queue.items():
+        for datapoints in self.upload_queue.values():
            self.points_written.inc(len(datapoints))
 
        try:
@@ -309,7 +312,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
    def __enter__(self) -> "TimeSeriesUploadQueue":
        """
-        Wraps around start method, for use as context manager
+        Wraps around start method, for use as context manager.
 
        Returns:
            self
@@ -321,7 +324,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
    ) -> None:
        """
-        Wraps around stop method, for use as context manager
+        Wraps around stop method, for use as context manager.
 
        Args:
            exc_type: Exception type
@@ -332,7 +335,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
    def __len__(self) -> int:
        """
-        The size of the upload queue
+        The size of the upload queue.
 
        Returns:
            Number of data points in queue
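
As an aside, here is a minimal usage sketch of TimeSeriesUploadQueue reflecting the keyword-only add_to_upload_queue signature and the context-manager support shown above. The client setup, external ID, and datapoint values are illustrative assumptions, not taken from the package. The SequenceUploadQueue hunks from the same file continue below.

# Minimal sketch (not from the package): queue datapoints and let the context manager flush them.
from cognite.client import CogniteClient
from cognite.extractorutils.uploader.time_series import TimeSeriesUploadQueue

client = CogniteClient()  # assumes credentials/configuration exist elsewhere

# __enter__/__exit__ wrap start()/stop(), so the upload thread runs for the duration of the block.
with TimeSeriesUploadQueue(client, max_upload_interval=60, create_missing=True) as queue:
    queue.add_to_upload_queue(
        external_id="my-timeseries",  # hypothetical external ID; either id or external_id must be set
        datapoints=[(1_700_000_000_000, 42.0)],  # illustrative (timestamp in ms, value) pairs
    )
# Remaining points are uploaded when the block exits.
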
@@ -341,6 +344,21 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
 
 class SequenceUploadQueue(AbstractUploadQueue):
+    """
+    Upload queue for sequences.
+
+    Args:
+        cdf_client: Cognite Data Fusion client to use
+        post_upload_function: A function that will be called after each upload. The function will be given one
+            argument: A list of the events that were uploaded.
+        max_queue_size: Maximum size of upload queue. Defaults to no max size.
+        max_upload_interval: Automatically trigger an upload each m seconds when run as a thread (use start/stop
+            methods).
+        trigger_log_level: Log level to log upload triggers to.
+        thread_name: Thread name of uploader thread.
+        create_missing: Create missing sequences if possible (ie, if external id is used).
+    """
+
     def __init__(
         self,
         cdf_client: CogniteClient,
@@ -352,19 +370,6 @@ class SequenceUploadQueue(AbstractUploadQueue):
         create_missing: bool = False,
         cancellation_token: CancellationToken | None = None,
     ):
-        """
-        Args:
-            cdf_client: Cognite Data Fusion client to use
-            post_upload_function: A function that will be called after each upload. The function will be given one
-                argument: A list of the events that were uploaded.
-            max_queue_size: Maximum size of upload queue. Defaults to no max size.
-            max_upload_interval: Automatically trigger an upload each m seconds when run as a thread (use start/stop
-                methods).
-            trigger_log_level: Log level to log upload triggers to.
-            thread_name: Thread name of uploader thread.
-            create_missing: Create missing sequences if possible (ie, if external id is used)
-        """
-
         # Super sets post_upload and threshold
         super().__init__(
             cdf_client,
@@ -393,7 +398,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
     def set_sequence_metadata(
         self,
         metadata: dict[str, str | int | float],
-        id: int | None = None,
+        id: int | None = None,  # noqa: A002
         external_id: str | None = None,
         asset_external_id: str | None = None,
         dataset_external_id: str | None = None,
@@ -401,8 +406,10 @@ class SequenceUploadQueue(AbstractUploadQueue):
         description: str | None = None,
     ) -> None:
         """
-        Set sequence metadata. Metadata will be cached until the sequence is created. The metadata will be updated
-        if the sequence already exists
+        Set sequence metadata.
+
+        Metadata will be cached until the sequence is created. The metadata will be updated if the sequence already
+        exists.
 
         Args:
             metadata: Sequence metadata
@@ -427,10 +434,13 @@ class SequenceUploadQueue(AbstractUploadQueue):
             self.sequence_descriptions[either_id] = description
 
     def set_sequence_column_definition(
-        self, col_def: list[dict[str, str]], id: int | None = None, external_id: str | None = None
+        self,
+        col_def: list[dict[str, str]],
+        id: int | None = None,  # noqa: A002
+        external_id: str | None = None,
     ) -> None:
         """
-        Set sequence column definition
+        Set sequence column definition.
 
         Args:
             col_def: Sequence column definition
@@ -450,12 +460,13 @@ class SequenceUploadQueue(AbstractUploadQueue):
         | SequenceData
         | SequenceRows,
         column_external_ids: list[dict] | None = None,
-        id: int | None = None,
+        id: int | None = None,  # noqa: A002
         external_id: str | None = None,
     ) -> None:
         """
-        Add sequence rows to upload queue. Mirrors implementation of SequenceApi.insert. Inserted rows will be
-        cached until uploaded
+        Add sequence rows to upload queue.
+
+        Mirrors implementation of SequenceApi.insert. Inserted rows will be cached until uploaded.
 
         Args:
             rows: The rows to be inserted. Can either be a list of tuples, a list of ["rownumber": ..., "values": ...]
@@ -466,7 +477,6 @@ class SequenceUploadQueue(AbstractUploadQueue):
             external_id: Sequence external ID
                 Us if id is None
         """
-
         if len(rows) == 0:
             pass
 
@@ -509,7 +519,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def upload(self) -> None:
         """
-        Trigger an upload of the queue, clears queue afterwards
+        Trigger an upload of the queue, clears queue afterwards.
         """
 
     @retry(
@@ -571,15 +581,14 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def _create_or_update(self, either_id: EitherId) -> None:
         """
-        Create or update sequence, based on provided metadata and column definitions
+        Create or update sequence, based on provided metadata and column definitions.
 
         Args:
             either_id: Id/External Id of sequence to be updated
         """
-
         column_def = self.column_definitions.get(either_id)
         if column_def is None:
-            self.logger.error(f"Can't create sequence {str(either_id)}, no column definitions provided")
+            self.logger.error(f"Can't create sequence {either_id!s}, no column definitions provided")
 
         try:
             seq = self.cdf_client.sequences.create(
@@ -596,7 +605,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
             )
 
         except CogniteDuplicatedError:
-            self.logger.info(f"Sequnce already exist: {either_id}")
+            self.logger.info(f"Sequence already exist: {either_id}")
             seq = self.cdf_client.sequences.retrieve(  # type: ignore [assignment]
                 id=either_id.internal_id,
                 external_id=either_id.external_id,
@@ -608,7 +617,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def _resolve_asset_ids(self) -> None:
         """
-        Resolve id of assets if specified, for use in sequence creation
+        Resolve id of assets if specified, for use in sequence creation.
         """
         assets = set(self.sequence_asset_external_ids.values())
         assets.discard(None)  # type: ignore # safeguard, remove Nones if any
@@ -628,7 +637,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def _resolve_dataset_ids(self) -> None:
         """
-        Resolve id of datasets if specified, for use in sequence creation
+        Resolve id of datasets if specified, for use in sequence creation.
         """
         datasets = set(self.sequence_dataset_external_ids.values())
         datasets.discard(None)  # type: ignore # safeguard, remove Nones if any
@@ -648,7 +657,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def __enter__(self) -> "SequenceUploadQueue":
         """
-        Wraps around start method, for use as context manager
+        Wraps around start method, for use as context manager.
 
         Returns:
             self
@@ -660,7 +669,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
         self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
     ) -> None:
         """
-        Wraps around stop method, for use as context manager
+        Wraps around stop method, for use as context manager.
 
         Args:
             exc_type: Exception type
@@ -671,7 +680,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def __len__(self) -> int:
         """
-        The size of the upload queue
+        The size of the upload queue.
 
         Returns:
             Number of data points in queue
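
A hedged usage sketch of SequenceUploadQueue following the signatures above; the column definition format and the row values are illustrative assumptions rather than excerpts from the package.

# Illustrative sketch (not from the package): cache metadata, columns and rows, then let the queue upload.
from cognite.client import CogniteClient
from cognite.extractorutils.uploader.time_series import SequenceUploadQueue

client = CogniteClient()  # assumes credentials/configuration exist elsewhere

with SequenceUploadQueue(client, create_missing=True) as queue:
    # Cached until the sequence is created (or updated if it already exists).
    queue.set_sequence_metadata({"source": "example"}, external_id="my-sequence")
    queue.set_sequence_column_definition(
        [{"externalId": "pressure", "valueType": "DOUBLE"}],  # hypothetical column definition
        external_id="my-sequence",
    )
    # Mirrors SequencesAPI.insert: rows given as (row_number, values) tuples.
    queue.add_to_upload_queue(rows=[(1, [42.0])], external_id="my-sequence")
# Queued rows are uploaded when the block exits.
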
cognite/extractorutils/uploader/upload_failure_handler.py

@@ -1,19 +1,34 @@
+"""
+This module provides a mechanism to handle file upload failures by logging details to a newline delimited JSON file.
+"""
+
 from collections.abc import Iterator
-from datetime import datetime
+from datetime import datetime, timezone
 
 import jsonlines
 
 
 class FileErrorMapping:
+    """
+    A class to represent a mapping of file name to its error reason.
+    """
+
     def __init__(self, file_name: str, error_reason: str) -> None:
         self.file_name = file_name
         self.error_reason = error_reason
 
     def __iter__(self) -> Iterator[list[str]]:
+        """
+        Returns an single-item iterator containing the file name and error reason.
+        """
         return iter([[self.file_name, self.error_reason]])
 
 
 class FileFailureManager:
+    """
+    A class to manage file upload failures by logging them to a newline delimited JSON file.
+    """
+
     MAX_QUEUE_SIZE = 500
     START_TIME_KEY = "start_time"
     FILE_REASON_MAP_KEY = "file_error_reason_map"
@@ -22,7 +37,7 @@ class FileFailureManager:
         self.failure_logs: dict[str, str] = {}
 
         self.path_to_failure_log: str = self._pre_process_file_extension(path_to_file)
-        self.start_time = start_time or str(datetime.now())
+        self.start_time = start_time or str(datetime.now(tz=timezone.utc))
         self._initialize_failure_logs()
 
     def _pre_process_file_extension(self, path_to_file: str | None) -> str:
@@ -34,13 +49,28 @@ class FileFailureManager:
         self.failure_logs = {}
 
     def __len__(self) -> int:
+        """
+        Returns the number of failure logs currently stored.
+        """
         return len(self.failure_logs)
 
     def clear(self) -> None:
+        """
+        Clears the queue of failure logs.
+        """
         self.failure_logs.clear()
         self._initialize_failure_logs()
 
     def add(self, file_name: str, error_reason: str) -> None:
+        """
+        Adds a file name and its error reason to the failure logs.
+
+        If the number of logs exceeds the maximum queue size, it writes the logs to a file.
+
+        Args:
+            file_name: The name of the file that failed to upload.
+            error_reason: The reason for the failure.
+        """
         error_file_object = FileErrorMapping(file_name=file_name, error_reason=error_reason)
         error_file_dict = dict(error_file_object)
 
@@ -50,6 +80,9 @@ class FileFailureManager:
             self.write_to_file()
 
     def write_to_file(self) -> None:
+        """
+        Flushes the current failure logs to a newline delimited JSON file and clears the queue.
+        """
         if len(self) == 0:
             return
 
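A small sketch of how FileFailureManager might be used, based only on the methods visible above; the constructor keyword and the file path are assumptions, since the __init__ signature itself is not part of this diff.

# Hedged sketch (constructor keyword assumed): collect upload failures and flush them as JSON lines.
from cognite.extractorutils.uploader.upload_failure_handler import FileFailureManager

failure_manager = FileFailureManager(path_to_file="upload_failures.jsonl")  # hypothetical path

# Each failure is queued in memory; add() flushes automatically once MAX_QUEUE_SIZE (500) entries accumulate.
failure_manager.add(file_name="report.pdf", error_reason="413 Payload Too Large")

# Flush whatever is left and reset the in-memory queue at the end of the run.
failure_manager.write_to_file()
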
cognite/extractorutils/uploader_extractor.py

@@ -1,3 +1,9 @@
+"""
+DEPRECATED. Use the normal base class and instantiate the upload queues manually.
+
+A module containing a version of the Extractor class with pre-defined upload queues.
+"""
+
 # Copyright 2022 Cognite AS
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,12 +18,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""
-A module containing a slightly more advanced base extractor class, sorting a generic output into upload queues.
-"""
-
 from collections.abc import Callable, Iterable
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from types import TracebackType
 from typing import Any, TypeVar
 
@@ -35,14 +37,22 @@ from cognite.extractorutils.uploader_types import CdfTypes, Event, InsertDatapoi
 
 @dataclass
 class QueueConfigClass:
+    """
+    Configuration for several upload queues.
+    """
+
     event_size: int = 10_000
     raw_size: int = 50_000
     timeseries_size: int = 1_000_000
-    upload_interval: TimeIntervalConfig = TimeIntervalConfig("1m")
+    upload_interval: TimeIntervalConfig = field(default_factory=lambda: TimeIntervalConfig("1m"))
 
 
 @dataclass
 class UploaderExtractorConfig(BaseConfig):
+    """
+    Base configuration for the UploaderExtractor.
+    """
+
     queues: QueueConfigClass | None
 
 
@@ -108,6 +118,13 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
         self.middleware = middleware if isinstance(middleware, list) else []
 
     def handle_output(self, output: CdfTypes) -> None:
+        """
+        Handle the output of the extractor and sort it into appropriate upload queues.
+
+        Args:
+            output: The output from the extractor, which can be an Event, RawRow, InsertDatapoints, or an iterable of
+                these types.
+        """
         list_output = [output] if not isinstance(output, Iterable) else output
         peekable_output = peekable(list_output)
 
@@ -145,6 +162,9 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
         return item
 
     def __enter__(self) -> "UploaderExtractor":
+        """
+        Initializes the upload queues and returns the extractor instance.
+        """
         super().__enter__()
 
         queue_config = self.config.queues if self.config.queues else QueueConfigClass()
@@ -173,6 +193,9 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
     def __exit__(
         self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
     ) -> bool:
+        """
+        Waits for the upload queues and exits the extractor context.
+        """
         self.event_queue.__exit__(exc_type, exc_val, exc_tb)
         self.raw_queue.__exit__(exc_type, exc_val, exc_tb)
         self.time_series_queue.__exit__(exc_type, exc_val, exc_tb)
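
The QueueConfigClass change above replaces a shared class-level default (TimeIntervalConfig("1m")) with a per-instance default_factory. Below is a generic, self-contained illustration of that dataclass pattern; Interval is a stand-in for TimeIntervalConfig and not part of the package.

from dataclasses import dataclass, field


class Interval:
    """Stand-in for TimeIntervalConfig: parses a string like '1m' into seconds."""

    def __init__(self, spec: str) -> None:
        self.seconds = int(spec.rstrip("m")) * 60


@dataclass
class QueueConfig:
    timeseries_size: int = 1_000_000
    # default_factory builds a fresh Interval for every instance instead of one shared default object.
    upload_interval: Interval = field(default_factory=lambda: Interval("1m"))


a, b = QueueConfig(), QueueConfig()
assert a.upload_interval is not b.upload_interval
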
cognite/extractorutils/uploader_types.py

@@ -1,3 +1,9 @@
+"""
+DEPRECATED: This module is deprecated and will be removed in a future release.
+
+These types are used in the UploaderExtractor, as well as the REST and MQTT extensions for the extractorutils library.
+"""
+
 from collections.abc import Iterable
 from typing import TypeAlias
 
@@ -7,13 +13,21 @@ from cognite.extractorutils.uploader.time_series import DataPoint
 
 
 class InsertDatapoints:
-    def __init__(self, *, id: int | None = None, external_id: str | None = None, datapoints: list[DataPoint]):
+    """
+    A class representing a batch of datapoints to be inserted into a time series.
+    """
+
+    def __init__(self, *, id: int | None = None, external_id: str | None = None, datapoints: list[DataPoint]):  # noqa: A002
         self.id = id
         self.external_id = external_id
         self.datapoints = datapoints
 
 
 class RawRow:
+    """
+    A class representing a row of data to be inserted into a RAW table.
+    """
+
     def __init__(self, db_name: str, table_name: str, row: _Row | Iterable[_Row]):
         self.db_name = db_name
         self.table_name = table_name
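
For reference, constructing these wrappers might look like the sketch below; the Row import path and the example values are assumptions, and the module itself is marked as deprecated. Instances like these are what UploaderExtractor.handle_output sorts into the event, RAW, and time series queues.

from cognite.client.data_classes import Row  # assumed import path for the RAW row type
from cognite.extractorutils.uploader_types import InsertDatapoints, RawRow

points = InsertDatapoints(external_id="my-timeseries", datapoints=[(1_700_000_000_000, 42.0)])
raw = RawRow(db_name="my_db", table_name="my_table", row=Row(key="row-1", columns={"value": 42}))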