cognite-extractor-utils 7.5.14__py3-none-any.whl → 7.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cognite-extractor-utils might be problematic.

Files changed (47)
  1. cognite/extractorutils/__init__.py +1 -1
  2. cognite/extractorutils/_inner_util.py +1 -1
  3. cognite/extractorutils/base.py +120 -40
  4. cognite/extractorutils/configtools/__init__.py +4 -5
  5. cognite/extractorutils/configtools/_util.py +3 -2
  6. cognite/extractorutils/configtools/elements.py +206 -33
  7. cognite/extractorutils/configtools/loaders.py +68 -16
  8. cognite/extractorutils/configtools/validators.py +5 -1
  9. cognite/extractorutils/exceptions.py +11 -2
  10. cognite/extractorutils/metrics.py +17 -12
  11. cognite/extractorutils/statestore/__init__.py +77 -3
  12. cognite/extractorutils/statestore/_base.py +7 -3
  13. cognite/extractorutils/statestore/hashing.py +129 -15
  14. cognite/extractorutils/statestore/watermark.py +77 -87
  15. cognite/extractorutils/threading.py +30 -4
  16. cognite/extractorutils/unstable/__init__.py +5 -5
  17. cognite/extractorutils/unstable/configuration/__init__.py +3 -0
  18. cognite/extractorutils/unstable/configuration/exceptions.py +13 -2
  19. cognite/extractorutils/unstable/configuration/loaders.py +78 -13
  20. cognite/extractorutils/unstable/configuration/models.py +121 -7
  21. cognite/extractorutils/unstable/core/__init__.py +5 -0
  22. cognite/extractorutils/unstable/core/_dto.py +5 -3
  23. cognite/extractorutils/unstable/core/base.py +113 -4
  24. cognite/extractorutils/unstable/core/errors.py +41 -0
  25. cognite/extractorutils/unstable/core/logger.py +149 -0
  26. cognite/extractorutils/unstable/core/restart_policy.py +16 -2
  27. cognite/extractorutils/unstable/core/runtime.py +44 -6
  28. cognite/extractorutils/unstable/core/tasks.py +53 -1
  29. cognite/extractorutils/unstable/scheduling/__init__.py +13 -0
  30. cognite/extractorutils/unstable/scheduling/_scheduler.py +1 -1
  31. cognite/extractorutils/uploader/__init__.py +7 -5
  32. cognite/extractorutils/uploader/_base.py +4 -5
  33. cognite/extractorutils/uploader/assets.py +13 -8
  34. cognite/extractorutils/uploader/data_modeling.py +37 -2
  35. cognite/extractorutils/uploader/events.py +14 -9
  36. cognite/extractorutils/uploader/files.py +80 -21
  37. cognite/extractorutils/uploader/raw.py +12 -7
  38. cognite/extractorutils/uploader/time_series.py +58 -49
  39. cognite/extractorutils/uploader/upload_failure_handler.py +35 -2
  40. cognite/extractorutils/uploader_extractor.py +29 -6
  41. cognite/extractorutils/uploader_types.py +15 -1
  42. cognite/extractorutils/util.py +76 -23
  43. {cognite_extractor_utils-7.5.14.dist-info → cognite_extractor_utils-7.6.0.dist-info}/METADATA +1 -1
  44. cognite_extractor_utils-7.6.0.dist-info/RECORD +50 -0
  45. cognite_extractor_utils-7.5.14.dist-info/RECORD +0 -50
  46. {cognite_extractor_utils-7.5.14.dist-info → cognite_extractor_utils-7.6.0.dist-info}/WHEEL +0 -0
  47. {cognite_extractor_utils-7.5.14.dist-info → cognite_extractor_utils-7.6.0.dist-info}/licenses/LICENSE +0 -0
cognite/extractorutils/uploader/time_series.py

@@ -1,3 +1,6 @@
+"""
+Upload queue for time series and sequences.
+"""
 # Copyright 2023 Cognite AS
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -81,7 +84,7 @@ def default_time_series_factory(external_id: str, datapoints: DataPointList) ->
 
 class TimeSeriesUploadQueue(AbstractUploadQueue):
     """
-    Upload queue for time series
+    Upload queue for time series.
 
     Args:
         cdf_client: Cognite Data Fusion client to use
@@ -141,7 +144,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         self.data_set_id = data_set_id
 
     def _verify_datapoint_time(self, time: int | float | datetime | str) -> bool:
-        if isinstance(time, int) or isinstance(time, float):
+        if isinstance(time, int | float):
             return not math.isnan(time) and time >= MIN_DATAPOINT_TIMESTAMP
         elif isinstance(time, str):
             return False
@@ -155,10 +158,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             )
         elif isinstance(value, str):
             return len(value) <= MAX_DATAPOINT_STRING_LENGTH
-        elif isinstance(value, datetime):
-            return False
-        else:
-            return True
+        return not isinstance(value, datetime)
 
     def _is_datapoint_valid(
         self,
@@ -172,11 +172,16 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         return True
 
     def add_to_upload_queue(
-        self, *, id: int | None = None, external_id: str | None = None, datapoints: DataPointList | None = None
+        self,
+        *,
+        id: int | None = None,  # noqa: A002
+        external_id: str | None = None,
+        datapoints: DataPointList | None = None,
     ) -> None:
         """
-        Add data points to upload queue. The queue will be uploaded if the queue size is larger than the threshold
-        specified in the __init__.
+        Add data points to upload queue.
+
+        The queue will be uploaded if the queue size is larger than the threshold specified in the ``__init__``.
 
         Args:
             id: Internal ID of time series. Either this or external_id must be set.
@@ -209,7 +214,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
     def upload(self) -> None:
         """
-        Trigger an upload of the queue, clears queue afterwards
+        Trigger an upload of the queue, clears queue afterwards.
         """
 
     @retry(
@@ -239,9 +244,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
             if self.create_missing:
                 # Get the time series that can be created
-                create_these_ids = set(
-                    [id_dict["externalId"] for id_dict in ex.not_found if "externalId" in id_dict]
-                )
+                create_these_ids = {id_dict["externalId"] for id_dict in ex.not_found if "externalId" in id_dict}
                 datapoints_lists: dict[str, DataPointList] = {
                     ts_dict["externalId"]: ts_dict["datapoints"]
                     for ts_dict in upload_this
@@ -294,7 +297,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             ]
         )
 
-        for _either_id, datapoints in self.upload_queue.items():
+        for datapoints in self.upload_queue.values():
            self.points_written.inc(len(datapoints))
 
        try:
@@ -309,7 +312,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
    def __enter__(self) -> "TimeSeriesUploadQueue":
        """
-        Wraps around start method, for use as context manager
+        Wraps around start method, for use as context manager.
 
        Returns:
            self
@@ -321,7 +324,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
    ) -> None:
        """
-        Wraps around stop method, for use as context manager
+        Wraps around stop method, for use as context manager.
 
        Args:
            exc_type: Exception type
@@ -332,7 +335,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
    def __len__(self) -> int:
        """
-        The size of the upload queue
+        The size of the upload queue.
 
        Returns:
            Number of data points in queue
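
As an aside, here is a minimal usage sketch of TimeSeriesUploadQueue reflecting the keyword-only add_to_upload_queue signature and the context-manager support shown above. The client setup, external ID, and datapoint values are illustrative assumptions, not taken from the package. The SequenceUploadQueue hunks from the same file continue below.

# Minimal sketch (not from the package): queue datapoints and let the context manager flush them.
from cognite.client import CogniteClient
from cognite.extractorutils.uploader.time_series import TimeSeriesUploadQueue

client = CogniteClient()  # assumes credentials/configuration exist elsewhere

# __enter__/__exit__ wrap start()/stop(), so the upload thread runs for the duration of the block.
with TimeSeriesUploadQueue(client, max_upload_interval=60, create_missing=True) as queue:
    queue.add_to_upload_queue(
        external_id="my-timeseries",  # hypothetical external ID; either id or external_id must be set
        datapoints=[(1_700_000_000_000, 42.0)],  # illustrative (timestamp in ms, value) pairs
    )
# Remaining points are uploaded when the block exits.
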
@@ -341,6 +344,21 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
 
 
 class SequenceUploadQueue(AbstractUploadQueue):
+    """
+    Upload queue for sequences.
+
+    Args:
+        cdf_client: Cognite Data Fusion client to use
+        post_upload_function: A function that will be called after each upload. The function will be given one
+            argument: A list of the events that were uploaded.
+        max_queue_size: Maximum size of upload queue. Defaults to no max size.
+        max_upload_interval: Automatically trigger an upload each m seconds when run as a thread (use start/stop
+            methods).
+        trigger_log_level: Log level to log upload triggers to.
+        thread_name: Thread name of uploader thread.
+        create_missing: Create missing sequences if possible (ie, if external id is used).
+    """
+
     def __init__(
         self,
         cdf_client: CogniteClient,
@@ -352,19 +370,6 @@ class SequenceUploadQueue(AbstractUploadQueue):
         create_missing: bool = False,
         cancellation_token: CancellationToken | None = None,
     ):
-        """
-        Args:
-            cdf_client: Cognite Data Fusion client to use
-            post_upload_function: A function that will be called after each upload. The function will be given one
-                argument: A list of the events that were uploaded.
-            max_queue_size: Maximum size of upload queue. Defaults to no max size.
-            max_upload_interval: Automatically trigger an upload each m seconds when run as a thread (use start/stop
-                methods).
-            trigger_log_level: Log level to log upload triggers to.
-            thread_name: Thread name of uploader thread.
-            create_missing: Create missing sequences if possible (ie, if external id is used)
-        """
-
         # Super sets post_upload and threshold
         super().__init__(
             cdf_client,
@@ -393,7 +398,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
     def set_sequence_metadata(
         self,
         metadata: dict[str, str | int | float],
-        id: int | None = None,
+        id: int | None = None,  # noqa: A002
         external_id: str | None = None,
         asset_external_id: str | None = None,
         dataset_external_id: str | None = None,
@@ -401,8 +406,10 @@ class SequenceUploadQueue(AbstractUploadQueue):
         description: str | None = None,
     ) -> None:
         """
-        Set sequence metadata. Metadata will be cached until the sequence is created. The metadata will be updated
-        if the sequence already exists
+        Set sequence metadata.
+
+        Metadata will be cached until the sequence is created. The metadata will be updated if the sequence already
+        exists.
 
         Args:
             metadata: Sequence metadata
@@ -427,10 +434,13 @@ class SequenceUploadQueue(AbstractUploadQueue):
             self.sequence_descriptions[either_id] = description
 
     def set_sequence_column_definition(
-        self, col_def: list[dict[str, str]], id: int | None = None, external_id: str | None = None
+        self,
+        col_def: list[dict[str, str]],
+        id: int | None = None,  # noqa: A002
+        external_id: str | None = None,
     ) -> None:
         """
-        Set sequence column definition
+        Set sequence column definition.
 
         Args:
             col_def: Sequence column definition
@@ -450,12 +460,13 @@ class SequenceUploadQueue(AbstractUploadQueue):
         | SequenceData
         | SequenceRows,
         column_external_ids: list[dict] | None = None,
-        id: int | None = None,
+        id: int | None = None,  # noqa: A002
         external_id: str | None = None,
     ) -> None:
         """
-        Add sequence rows to upload queue. Mirrors implementation of SequenceApi.insert. Inserted rows will be
-        cached until uploaded
+        Add sequence rows to upload queue.
+
+        Mirrors implementation of SequenceApi.insert. Inserted rows will be cached until uploaded.
 
         Args:
             rows: The rows to be inserted. Can either be a list of tuples, a list of ["rownumber": ..., "values": ...]
@@ -466,7 +477,6 @@ class SequenceUploadQueue(AbstractUploadQueue):
             external_id: Sequence external ID
                 Us if id is None
         """
-
         if len(rows) == 0:
             pass
 
@@ -509,7 +519,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def upload(self) -> None:
         """
-        Trigger an upload of the queue, clears queue afterwards
+        Trigger an upload of the queue, clears queue afterwards.
         """
 
     @retry(
@@ -571,15 +581,14 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def _create_or_update(self, either_id: EitherId) -> None:
         """
-        Create or update sequence, based on provided metadata and column definitions
+        Create or update sequence, based on provided metadata and column definitions.
 
         Args:
             either_id: Id/External Id of sequence to be updated
         """
-
         column_def = self.column_definitions.get(either_id)
         if column_def is None:
-            self.logger.error(f"Can't create sequence {str(either_id)}, no column definitions provided")
+            self.logger.error(f"Can't create sequence {either_id!s}, no column definitions provided")
 
         try:
             seq = self.cdf_client.sequences.create(
@@ -596,7 +605,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
             )
 
         except CogniteDuplicatedError:
-            self.logger.info(f"Sequnce already exist: {either_id}")
+            self.logger.info(f"Sequence already exist: {either_id}")
             seq = self.cdf_client.sequences.retrieve(  # type: ignore [assignment]
                 id=either_id.internal_id,
                 external_id=either_id.external_id,
@@ -608,7 +617,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def _resolve_asset_ids(self) -> None:
         """
-        Resolve id of assets if specified, for use in sequence creation
+        Resolve id of assets if specified, for use in sequence creation.
         """
         assets = set(self.sequence_asset_external_ids.values())
         assets.discard(None)  # type: ignore # safeguard, remove Nones if any
@@ -628,7 +637,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def _resolve_dataset_ids(self) -> None:
         """
-        Resolve id of datasets if specified, for use in sequence creation
+        Resolve id of datasets if specified, for use in sequence creation.
         """
         datasets = set(self.sequence_dataset_external_ids.values())
         datasets.discard(None)  # type: ignore # safeguard, remove Nones if any
@@ -648,7 +657,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def __enter__(self) -> "SequenceUploadQueue":
         """
-        Wraps around start method, for use as context manager
+        Wraps around start method, for use as context manager.
 
         Returns:
             self
@@ -660,7 +669,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
         self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
     ) -> None:
         """
-        Wraps around stop method, for use as context manager
+        Wraps around stop method, for use as context manager.
 
         Args:
             exc_type: Exception type
@@ -671,7 +680,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
 
     def __len__(self) -> int:
         """
-        The size of the upload queue
+        The size of the upload queue.
 
         Returns:
             Number of data points in queue
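
A hedged usage sketch of SequenceUploadQueue following the signatures above; the column definition format and the row values are illustrative assumptions rather than excerpts from the package.

# Illustrative sketch (not from the package): cache metadata, columns and rows, then let the queue upload.
from cognite.client import CogniteClient
from cognite.extractorutils.uploader.time_series import SequenceUploadQueue

client = CogniteClient()  # assumes credentials/configuration exist elsewhere

with SequenceUploadQueue(client, create_missing=True) as queue:
    # Cached until the sequence is created (or updated if it already exists).
    queue.set_sequence_metadata({"source": "example"}, external_id="my-sequence")
    queue.set_sequence_column_definition(
        [{"externalId": "pressure", "valueType": "DOUBLE"}],  # hypothetical column definition
        external_id="my-sequence",
    )
    # Mirrors SequencesAPI.insert: rows given as (row_number, values) tuples.
    queue.add_to_upload_queue(rows=[(1, [42.0])], external_id="my-sequence")
# Queued rows are uploaded when the block exits.
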
cognite/extractorutils/uploader/upload_failure_handler.py

@@ -1,19 +1,34 @@
+"""
+This module provides a mechanism to handle file upload failures by logging details to a newline delimited JSON file.
+"""
+
 from collections.abc import Iterator
-from datetime import datetime
+from datetime import datetime, timezone
 
 import jsonlines
 
 
 class FileErrorMapping:
+    """
+    A class to represent a mapping of file name to its error reason.
+    """
+
     def __init__(self, file_name: str, error_reason: str) -> None:
         self.file_name = file_name
         self.error_reason = error_reason
 
     def __iter__(self) -> Iterator[list[str]]:
+        """
+        Returns an single-item iterator containing the file name and error reason.
+        """
         return iter([[self.file_name, self.error_reason]])
 
 
 class FileFailureManager:
+    """
+    A class to manage file upload failures by logging them to a newline delimited JSON file.
+    """
+
     MAX_QUEUE_SIZE = 500
     START_TIME_KEY = "start_time"
     FILE_REASON_MAP_KEY = "file_error_reason_map"
@@ -22,7 +37,7 @@ class FileFailureManager:
         self.failure_logs: dict[str, str] = {}
 
         self.path_to_failure_log: str = self._pre_process_file_extension(path_to_file)
-        self.start_time = start_time or str(datetime.now())
+        self.start_time = start_time or str(datetime.now(tz=timezone.utc))
         self._initialize_failure_logs()
 
     def _pre_process_file_extension(self, path_to_file: str | None) -> str:
@@ -34,13 +49,28 @@ class FileFailureManager:
         self.failure_logs = {}
 
     def __len__(self) -> int:
+        """
+        Returns the number of failure logs currently stored.
+        """
         return len(self.failure_logs)
 
     def clear(self) -> None:
+        """
+        Clears the queue of failure logs.
+        """
         self.failure_logs.clear()
         self._initialize_failure_logs()
 
     def add(self, file_name: str, error_reason: str) -> None:
+        """
+        Adds a file name and its error reason to the failure logs.
+
+        If the number of logs exceeds the maximum queue size, it writes the logs to a file.
+
+        Args:
+            file_name: The name of the file that failed to upload.
+            error_reason: The reason for the failure.
+        """
         error_file_object = FileErrorMapping(file_name=file_name, error_reason=error_reason)
         error_file_dict = dict(error_file_object)
 
@@ -50,6 +80,9 @@ class FileFailureManager:
             self.write_to_file()
 
     def write_to_file(self) -> None:
+        """
+        Flushes the current failure logs to a newline delimited JSON file and clears the queue.
+        """
         if len(self) == 0:
             return
 
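A small sketch of how FileFailureManager might be used, based only on the methods visible above; the constructor keyword and the file path are assumptions, since the __init__ signature itself is not part of this diff.

# Hedged sketch (constructor keyword assumed): collect upload failures and flush them as JSON lines.
from cognite.extractorutils.uploader.upload_failure_handler import FileFailureManager

failure_manager = FileFailureManager(path_to_file="upload_failures.jsonl")  # hypothetical path

# Each failure is queued in memory; add() flushes automatically once MAX_QUEUE_SIZE (500) entries accumulate.
failure_manager.add(file_name="report.pdf", error_reason="413 Payload Too Large")

# Flush whatever is left and reset the in-memory queue at the end of the run.
failure_manager.write_to_file()
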
cognite/extractorutils/uploader_extractor.py

@@ -1,3 +1,9 @@
+"""
+DEPRECATED. Use the normal base class and instantiate the upload queues manually.
+
+A module containing a version of the Extractor class with pre-defined upload queues.
+"""
+
 # Copyright 2022 Cognite AS
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,12 +18,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""
-A module containing a slightly more advanced base extractor class, sorting a generic output into upload queues.
-"""
-
 from collections.abc import Callable, Iterable
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from types import TracebackType
 from typing import Any, TypeVar
 
@@ -35,14 +37,22 @@ from cognite.extractorutils.uploader_types import CdfTypes, Event, InsertDatapoi
 
 @dataclass
 class QueueConfigClass:
+    """
+    Configuration for several upload queues.
+    """
+
     event_size: int = 10_000
     raw_size: int = 50_000
     timeseries_size: int = 1_000_000
-    upload_interval: TimeIntervalConfig = TimeIntervalConfig("1m")
+    upload_interval: TimeIntervalConfig = field(default_factory=lambda: TimeIntervalConfig("1m"))
 
 
 @dataclass
 class UploaderExtractorConfig(BaseConfig):
+    """
+    Base configuration for the UploaderExtractor.
+    """
+
     queues: QueueConfigClass | None
 
 
@@ -108,6 +118,13 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
         self.middleware = middleware if isinstance(middleware, list) else []
 
     def handle_output(self, output: CdfTypes) -> None:
+        """
+        Handle the output of the extractor and sort it into appropriate upload queues.
+
+        Args:
+            output: The output from the extractor, which can be an Event, RawRow, InsertDatapoints, or an iterable of
+                these types.
+        """
         list_output = [output] if not isinstance(output, Iterable) else output
         peekable_output = peekable(list_output)
 
@@ -145,6 +162,9 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
         return item
 
     def __enter__(self) -> "UploaderExtractor":
+        """
+        Initializes the upload queues and returns the extractor instance.
+        """
         super().__enter__()
 
         queue_config = self.config.queues if self.config.queues else QueueConfigClass()
@@ -173,6 +193,9 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
     def __exit__(
         self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
     ) -> bool:
+        """
+        Waits for the upload queues and exits the extractor context.
+        """
         self.event_queue.__exit__(exc_type, exc_val, exc_tb)
         self.raw_queue.__exit__(exc_type, exc_val, exc_tb)
         self.time_series_queue.__exit__(exc_type, exc_val, exc_tb)
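
The QueueConfigClass change above replaces a shared class-level default (TimeIntervalConfig("1m")) with a per-instance default_factory. Below is a generic, self-contained illustration of that dataclass pattern; Interval is a stand-in for TimeIntervalConfig and not part of the package.

from dataclasses import dataclass, field


class Interval:
    """Stand-in for TimeIntervalConfig: parses a string like '1m' into seconds."""

    def __init__(self, spec: str) -> None:
        self.seconds = int(spec.rstrip("m")) * 60


@dataclass
class QueueConfig:
    timeseries_size: int = 1_000_000
    # default_factory builds a fresh Interval for every instance instead of one shared default object.
    upload_interval: Interval = field(default_factory=lambda: Interval("1m"))


a, b = QueueConfig(), QueueConfig()
assert a.upload_interval is not b.upload_interval
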
cognite/extractorutils/uploader_types.py

@@ -1,3 +1,9 @@
+"""
+DEPRECATED: This module is deprecated and will be removed in a future release.
+
+These types are used in the UploaderExtractor, as well as the REST and MQTT extensions for the extractorutils library.
+"""
+
 from collections.abc import Iterable
 from typing import TypeAlias
 
@@ -7,13 +13,21 @@ from cognite.extractorutils.uploader.time_series import DataPoint
 
 
 class InsertDatapoints:
-    def __init__(self, *, id: int | None = None, external_id: str | None = None, datapoints: list[DataPoint]):
+    """
+    A class representing a batch of datapoints to be inserted into a time series.
+    """
+
+    def __init__(self, *, id: int | None = None, external_id: str | None = None, datapoints: list[DataPoint]):  # noqa: A002
         self.id = id
         self.external_id = external_id
         self.datapoints = datapoints
 
 
 class RawRow:
+    """
+    A class representing a row of data to be inserted into a RAW table.
+    """
+
     def __init__(self, db_name: str, table_name: str, row: _Row | Iterable[_Row]):
         self.db_name = db_name
         self.table_name = table_name
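
For reference, constructing these wrappers might look like the sketch below; the Row import path and the example values are assumptions, and the module itself is marked as deprecated. Instances like these are what UploaderExtractor.handle_output sorts into the event, RAW, and time series queues.

from cognite.client.data_classes import Row  # assumed import path for the RAW row type
from cognite.extractorutils.uploader_types import InsertDatapoints, RawRow

points = InsertDatapoints(external_id="my-timeseries", datapoints=[(1_700_000_000_000, 42.0)])
raw = RawRow(db_name="my_db", table_name="my_table", row=Row(key="row-1", columns={"value": 42}))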