cognite-extractor-utils 7.6.0__py3-none-any.whl → 7.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognite/examples/unstable/extractors/simple_extractor/config/config.yaml +3 -0
- cognite/examples/unstable/extractors/simple_extractor/config/connection_config.yaml +10 -0
- cognite/examples/unstable/extractors/simple_extractor/main.py +81 -0
- cognite/extractorutils/__init__.py +1 -1
- cognite/extractorutils/_inner_util.py +2 -2
- cognite/extractorutils/base.py +1 -1
- cognite/extractorutils/configtools/elements.py +4 -2
- cognite/extractorutils/configtools/loaders.py +3 -3
- cognite/extractorutils/exceptions.py +1 -1
- cognite/extractorutils/metrics.py +8 -6
- cognite/extractorutils/statestore/watermark.py +6 -3
- cognite/extractorutils/threading.py +2 -2
- cognite/extractorutils/unstable/configuration/exceptions.py +28 -1
- cognite/extractorutils/unstable/configuration/models.py +157 -32
- cognite/extractorutils/unstable/core/_dto.py +80 -7
- cognite/extractorutils/unstable/core/base.py +175 -106
- cognite/extractorutils/unstable/core/checkin_worker.py +428 -0
- cognite/extractorutils/unstable/core/errors.py +2 -2
- cognite/extractorutils/unstable/core/logger.py +49 -0
- cognite/extractorutils/unstable/core/runtime.py +200 -31
- cognite/extractorutils/unstable/core/tasks.py +2 -2
- cognite/extractorutils/uploader/__init__.py +2 -0
- cognite/extractorutils/uploader/_base.py +1 -1
- cognite/extractorutils/uploader/assets.py +1 -1
- cognite/extractorutils/uploader/data_modeling.py +1 -1
- cognite/extractorutils/uploader/events.py +1 -1
- cognite/extractorutils/uploader/files.py +4 -4
- cognite/extractorutils/uploader/raw.py +1 -1
- cognite/extractorutils/uploader/time_series.py +319 -52
- cognite/extractorutils/uploader_extractor.py +20 -5
- cognite/extractorutils/uploader_types.py +13 -2
- cognite/extractorutils/util.py +8 -6
- {cognite_extractor_utils-7.6.0.dist-info → cognite_extractor_utils-7.8.0.dist-info}/METADATA +3 -2
- cognite_extractor_utils-7.8.0.dist-info/RECORD +55 -0
- cognite_extractor_utils-7.8.0.dist-info/entry_points.txt +2 -0
- cognite_extractor_utils-7.6.0.dist-info/RECORD +0 -50
- {cognite_extractor_utils-7.6.0.dist-info → cognite_extractor_utils-7.8.0.dist-info}/WHEEL +0 -0
- {cognite_extractor_utils-7.6.0.dist-info → cognite_extractor_utils-7.8.0.dist-info}/licenses/LICENSE +0 -0
cognite/extractorutils/uploader/time_series.py
CHANGED

@@ -19,7 +19,7 @@ import math
 from collections.abc import Callable
 from datetime import datetime
 from types import TracebackType
-from typing import Any
+from typing import Any, Generic, Literal, TypedDict, TypeVar

 from cognite.client import CogniteClient
 from cognite.client.data_classes import (
@@ -29,6 +29,9 @@ from cognite.client.data_classes import (
     StatusCode,
     TimeSeries,
 )
+from cognite.client.data_classes.data_modeling import NodeId
+from cognite.client.data_classes.data_modeling.extractor_extensions.v1 import CogniteExtractorTimeSeriesApply
+from cognite.client.data_classes.data_modeling.instances import DirectRelationReference
 from cognite.client.exceptions import CogniteDuplicatedError, CogniteNotFoundError
 from cognite.extractorutils.threading import CancellationToken
 from cognite.extractorutils.uploader._base import (
@@ -62,6 +65,18 @@ DataPointWithStatus = tuple[TimeStamp, float, FullStatusCode] | tuple[TimeStamp,
 DataPoint = DataPointWithoutStatus | DataPointWithStatus
 DataPointList = list[DataPoint]

+TQueue = TypeVar("TQueue", bound="BaseTimeSeriesUploadQueue")
+IdType = TypeVar("IdType", EitherId, NodeId)
+
+
+class CdmDatapointsPayload(TypedDict):
+    """
+    Represents a payload for CDF datapoints, linking them to a specific instance.
+    """
+
+    instanceId: NodeId
+    datapoints: DataPointList
+

 def default_time_series_factory(external_id: str, datapoints: DataPointList) -> TimeSeries:
     """
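For reference, a minimal sketch of the payload shape this TypedDict describes (the space, external id and values below are made up):

# Hypothetical example of a CdmDatapointsPayload: one CDM instance id paired
# with the datapoints bound for it.
from cognite.client.data_classes.data_modeling import NodeId
from cognite.extractorutils.uploader.time_series import CdmDatapointsPayload

payload: CdmDatapointsPayload = {
    "instanceId": NodeId("my-space", "pump-42-temperature"),
    "datapoints": [(1_700_000_000_000, 21.3), (1_700_000_060_000, 21.4)],
}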
@@ -82,9 +97,9 @@ def default_time_series_factory(external_id: str, datapoints: DataPointList) ->
     return TimeSeries(external_id=external_id, is_string=is_string)


-class TimeSeriesUploadQueue(AbstractUploadQueue):
+class BaseTimeSeriesUploadQueue(AbstractUploadQueue, Generic[IdType]):
     """
-    Upload queue for time series.
+    Abstract base upload queue for time series.

     Args:
         cdf_client: Cognite Data Fusion client to use
@@ -96,12 +111,6 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             methods).
         trigger_log_level: Log level to log upload triggers to.
         thread_name: Thread name of uploader thread.
-        create_missing: Create missing time series if possible (ie, if external id is used). Either given as a boolean
-            (True would auto-create a time series with nothing but an external ID), or as a factory function taking an
-            external ID and a list of datapoints about to be inserted and returning a TimeSeries object.
-        data_set_id: Data set id passed to create_missing. Does nothing if create_missing is False.
-            If a custom timeseries creation method is set in create_missing, this is used as fallback if
-            that method does not set data set id on its own.
     """

     def __init__(
@@ -112,10 +121,8 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         max_upload_interval: int | None = None,
         trigger_log_level: str = "DEBUG",
         thread_name: str | None = None,
-        create_missing: Callable[[str, DataPointList], TimeSeries] | bool = False,
-        data_set_id: int | None = None,
         cancellation_token: CancellationToken | None = None,
-    ):
+    ) -> None:
         # Super sets post_upload and threshold
         super().__init__(
             cdf_client,
@@ -127,21 +134,11 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             cancellation_token,
         )

-        self.missing_factory: Callable[[str, DataPointList], TimeSeries]
-
-        if isinstance(create_missing, bool):
-            self.create_missing = create_missing
-            self.missing_factory = default_time_series_factory
-        else:
-            self.create_missing = True
-            self.missing_factory = create_missing
-
-        self.upload_queue: dict[EitherId, DataPointList] = {}
+        self.upload_queue: dict[IdType, DataPointList] = {}

         self.points_queued = TIMESERIES_UPLOADER_POINTS_QUEUED
         self.points_written = TIMESERIES_UPLOADER_POINTS_WRITTEN
         self.queue_size = TIMESERIES_UPLOADER_QUEUE_SIZE
-        self.data_set_id = data_set_id

     def _verify_datapoint_time(self, time: int | float | datetime | str) -> bool:
         if isinstance(time, int | float):
@@ -171,6 +168,109 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         else:
             return True

+    def _sanitize_datapoints(self, datapoints: DataPointList | None) -> DataPointList:
+        datapoints = datapoints or []
+        old_len = len(datapoints)
+        datapoints = list(filter(self._is_datapoint_valid, datapoints))
+
+        new_len = len(datapoints)
+
+        if old_len > new_len:
+            diff = old_len - new_len
+            self.logger.warning(f"Discarding {diff} datapoints due to bad timestamp or value")
+            TIMESERIES_UPLOADER_POINTS_DISCARDED.inc(diff)
+
+        return datapoints
+
+    def __exit__(
+        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
+    ) -> None:
+        """
+        Wraps around stop method, for use as context manager.
+
+        Args:
+            exc_type: Exception type
+            exc_val: Exception value
+            exc_tb: Traceback
+        """
+        self.stop()
+
+    def __len__(self) -> int:
+        """
+        The size of the upload queue.
+
+        Returns:
+            Number of data points in queue
+        """
+        return self.upload_queue_size
+
+    def __enter__(self: TQueue) -> TQueue:
+        """
+        Wraps around start method, for use as context manager.
+
+        Returns:
+            self
+        """
+        self.start()
+        return self
+
+
+class TimeSeriesUploadQueue(BaseTimeSeriesUploadQueue[EitherId]):
+    """
+    Upload queue for time series.
+
+    Args:
+        cdf_client: Cognite Data Fusion client to use
+        post_upload_function: A function that will be called after each upload. The function will be given one argument:
+            A list of dicts containing the datapoints that were uploaded (on the same format as the kwargs in
+            datapoints upload in the Cognite SDK).
+        max_queue_size: Maximum size of upload queue. Defaults to no max size.
+        max_upload_interval: Automatically trigger an upload each m seconds when run as a thread (use start/stop
+            methods).
+        trigger_log_level: Log level to log upload triggers to.
+        thread_name: Thread name of uploader thread.
+        create_missing: Create missing time series if possible (ie, if external id is used). Either given as a boolean
+            (True would auto-create a time series with nothing but an external ID), or as a factory function taking an
+            external ID and a list of datapoints about to be inserted and returning a TimeSeries object.
+        data_set_id: Data set id passed to create_missing. Does nothing if create_missing is False.
+            If a custom timeseries creation method is set in create_missing, this is used as fallback if
+            that method does not set data set id on its own.
+    """
+
+    def __init__(
+        self,
+        cdf_client: CogniteClient,
+        post_upload_function: Callable[[list[dict[str, str | DataPointList]]], None] | None = None,
+        max_queue_size: int | None = None,
+        max_upload_interval: int | None = None,
+        trigger_log_level: str = "DEBUG",
+        thread_name: str | None = None,
+        create_missing: Callable[[str, DataPointList], TimeSeries] | bool = False,
+        data_set_id: int | None = None,
+        cancellation_token: CancellationToken | None = None,
+    ) -> None:
+        # Super sets post_upload and threshold
+        super().__init__(
+            cdf_client,
+            post_upload_function,
+            max_queue_size,
+            max_upload_interval,
+            trigger_log_level,
+            thread_name,
+            cancellation_token,
+        )
+
+        self.missing_factory: Callable[[str, DataPointList], TimeSeries]
+
+        if isinstance(create_missing, bool):
+            self.create_missing = create_missing
+            self.missing_factory = default_time_series_factory
+        else:
+            self.create_missing = True
+            self.missing_factory = create_missing
+
+        self.data_set_id = data_set_id
+
     def add_to_upload_queue(
         self,
         *,
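As a usage sketch of the refactored queue (client configuration, external id and values below are made up; existing callers should not need changes), the subclass is still used as a context manager:

from cognite.client import CogniteClient
from cognite.extractorutils.uploader import TimeSeriesUploadQueue

client = CogniteClient()  # assumes credentials are already configured

# create_missing=True auto-creates any time series that do not exist yet
with TimeSeriesUploadQueue(client, create_missing=True, max_upload_interval=30) as queue:
    queue.add_to_upload_queue(
        external_id="my-timeseries",
        datapoints=[(1_700_000_000_000, 1.0), (1_700_000_060_000, 2.0)],
    )
# leaving the with-block stops the uploader thread and flushes any remaining points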
@@ -188,16 +288,7 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
             external_id: External ID of time series. Either this or external_id must be set.
             datapoints: list of data points to add
         """
-        datapoints = datapoints
-        old_len = len(datapoints)
-        datapoints = list(filter(self._is_datapoint_valid, datapoints))
-
-        new_len = len(datapoints)
-
-        if old_len > new_len:
-            diff = old_len - new_len
-            self.logger.warning(f"Discarding {diff} datapoints due to bad timestamp or value")
-            TIMESERIES_UPLOADER_POINTS_DISCARDED.inc(diff)
+        datapoints = self._sanitize_datapoints(datapoints)

         either_id = EitherId(id=id, external_id=external_id)

@@ -310,37 +401,213 @@ class TimeSeriesUploadQueue(AbstractUploadQueue):
         self.upload_queue_size = 0
         self.queue_size.set(self.upload_queue_size)

-    def __enter__(self) -> "TimeSeriesUploadQueue":
+
+class CDMTimeSeriesUploadQueue(BaseTimeSeriesUploadQueue[NodeId]):
+    """
+    Upload queue for CDM time series.
+
+    Args:
+        cdf_client: Cognite Data Fusion client to use
+        post_upload_function: A function that will be called after each upload. The function will be given one argument:
+            A list of dicts containing the datapoints that were uploaded (on the same format as the kwargs in
+            datapoints upload in the Cognite SDK).
+        max_queue_size: Maximum size of upload queue. Defaults to no max size.
+        max_upload_interval: Automatically trigger an upload each m seconds when run as a thread (use start/stop
+            methods).
+        trigger_log_level: Log level to log upload triggers to.
+        thread_name: Thread name of uploader thread.
+    """
+
+    def __init__(
+        self,
+        cdf_client: CogniteClient,
+        post_upload_function: Callable[[list[dict[str, str | DataPointList]]], None] | None = None,
+        max_queue_size: int | None = None,
+        max_upload_interval: int | None = None,
+        trigger_log_level: str = "DEBUG",
+        thread_name: str | None = None,
+        create_missing: Callable[[NodeId, DataPointList], CogniteExtractorTimeSeriesApply] | bool = False,
+        cancellation_token: CancellationToken | None = None,
+        source: DirectRelationReference | None = None,
+    ) -> None:
+        super().__init__(
+            cdf_client,
+            post_upload_function,
+            max_queue_size,
+            max_upload_interval,
+            trigger_log_level,
+            thread_name,
+            cancellation_token,
+        )
+
+        self.missing_factory: Callable[[NodeId, DataPointList], CogniteExtractorTimeSeriesApply]
+        self.source = source
+
+        if isinstance(create_missing, bool):
+            self.create_missing = create_missing
+            self.missing_factory = self.default_cdm_time_series_factory
+        else:
+            self.create_missing = True
+            self.missing_factory = create_missing
+
+    def default_cdm_time_series_factory(
+        self, instance_id: NodeId, datapoints: DataPointList
+    ) -> CogniteExtractorTimeSeriesApply:
         """
-        Wraps around start method, for use as context manager.
+        Default CDM time series factory used when create_missing in a CDMTimeSeriesUploadQueue is given as a boolean.

+        Args:
+            instance_id: Instance ID of time series to create
+            datapoints: The list of datapoints that were tried to be inserted
+            source: The source of the time series, used for creating the DirectRelationReference
         Returns:
-            self
+            A CogniteExtractorTimeSeriesApply object with instance_id set, and the is_string automatically detected
         """
-        self.start()
-        return self
+        is_string = (
+            isinstance(datapoints[0].get("value"), str)
+            if isinstance(datapoints[0], dict)
+            else isinstance(datapoints[0][1], str)
+        )

-    def __exit__(
-        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
+        time_series_type: Literal["numeric", "string"] = "string" if is_string else "numeric"
+
+        return CogniteExtractorTimeSeriesApply(
+            space=instance_id.space,
+            external_id=instance_id.external_id,
+            is_step=False,
+            time_series_type=time_series_type,
+            source=self.source,
+        )
+
+    def add_to_upload_queue(
+        self,
+        *,
+        instance_id: NodeId,
+        datapoints: DataPointList | None = None,
     ) -> None:
         """
-        Wraps around stop method, for use as context manager.
+        Add data points to upload queue.
+
+        The queue will be uploaded if the queue size is larger than the threshold specified in the __init__.

         Args:
-            exc_type: Exception type
-            exc_val: Exception value
-            exc_tb: Traceback
+            instance_id: The identifier for the time series to which the datapoints belong.
+            datapoints: list of data points to add
         """
-        self.stop()
+        datapoints = self._sanitize_datapoints(datapoints)

-    def __len__(self) -> int:
-        """
-        The size of the upload queue.
+        with self.lock:
+            if instance_id not in self.upload_queue:
+                self.upload_queue[instance_id] = []

-        Returns:
-            Number of data points in queue
+            self.upload_queue[instance_id].extend(datapoints)
+            self.points_queued.inc(len(datapoints))
+            self.upload_queue_size += len(datapoints)
+            self.queue_size.set(self.upload_queue_size)
+
+            self._check_triggers()
+
+    def upload(self) -> None:
+        """
+        Trigger an upload of the queue, clears queue afterwards.
         """
-        return self.upload_queue_size
+
+        @retry(
+            exceptions=cognite_exceptions(),
+            cancellation_token=self.cancellation_token,
+            tries=RETRIES,
+            delay=RETRY_DELAY,
+            max_delay=RETRY_MAX_DELAY,
+            backoff=RETRY_BACKOFF_FACTOR,
+        )
+        def _upload_batch(upload_this: list[CdmDatapointsPayload], retries: int = 5) -> list[CdmDatapointsPayload]:
+            if len(upload_this) == 0:
+                return upload_this
+
+            try:
+                self.cdf_client.time_series.data.insert_multiple(upload_this)  # type: ignore[arg-type]
+            except CogniteNotFoundError as ex:
+                if not retries:
+                    raise ex
+
+                if not self.create_missing:
+                    self.logger.error("Could not upload data points to %s: %s", str(ex.not_found), str(ex))
+
+                # Get IDs of time series that exists, but failed because of the non-existing time series
+                retry_these = [
+                    NodeId(id_dict["instanceId"]["space"], id_dict["instanceId"]["externalId"])
+                    for id_dict in ex.failed
+                    if id_dict not in ex.not_found
+                ]
+
+                if self.create_missing:
+                    # Get the time series that can be created
+                    create_these_ids = {
+                        NodeId(id_dict["instanceId"]["space"], id_dict["instanceId"]["externalId"])
+                        for id_dict in ex.not_found
+                    }
+                    self.logger.info(f"Creating {len(create_these_ids)} time series")
+
+                    datapoints_lists: dict[NodeId, DataPointList] = {
+                        ts_dict["instanceId"]: ts_dict["datapoints"]
+                        for ts_dict in upload_this
+                        if ts_dict["instanceId"] in create_these_ids
+                    }
+
+                    to_create: list[CogniteExtractorTimeSeriesApply] = [
+                        self.missing_factory(instance_id, datapoints_lists[instance_id])
+                        for instance_id in create_these_ids
+                    ]
+
+                    instance_result = self.cdf_client.data_modeling.instances.apply(to_create)
+                    retry_these.extend([node.as_id() for node in instance_result.nodes])
+
+                if len(ex.not_found) != len(create_these_ids):
+                    missing = [
+                        id_dict
+                        for id_dict in ex.not_found
+                        if NodeId(id_dict["instanceId"]["space"], id_dict["instanceId"]["externalId"])
+                        not in retry_these
+                    ]
+                    missing_num = len(ex.not_found) - len(create_these_ids)
+                    self.logger.error(
+                        f"{missing_num} time series not found, and could not be created automatically: "
+                        + str(missing)
+                        + " Data will be dropped"
+                    )
+
+                # Remove entries with non-existing time series from upload queue
+                upload_this = [entry for entry in upload_this if entry["instanceId"] in retry_these]
+
+                # Upload remaining
+                _upload_batch(upload_this, retries - 1)
+
+            return upload_this
+
+        if len(self.upload_queue) == 0:
+            return
+
+        with self.lock:
+            upload_this = _upload_batch(
+                [
+                    {"instanceId": instance_id, "datapoints": list(datapoints)}
+                    for instance_id, datapoints in self.upload_queue.items()
+                    if len(datapoints) > 0
+                ]
+            )
+
+            for datapoints in self.upload_queue.values():
+                self.points_written.inc(len(datapoints))
+
+            try:
+                self._post_upload(upload_this)
+            except Exception as e:
+                self.logger.error("Error in upload callback: %s", str(e))
+
+            self.upload_queue.clear()
+            self.logger.info(f"Uploaded {self.upload_queue_size} datapoints")
+            self.upload_queue_size = 0
+            self.queue_size.set(self.upload_queue_size)


 class SequenceUploadQueue(AbstractUploadQueue):
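A usage sketch for the new queue (space, external id and values below are made up); with create_missing=True, missing CogniteExtractorTimeSeries instances are created on the fly via default_cdm_time_series_factory:

from cognite.client import CogniteClient
from cognite.client.data_classes.data_modeling import NodeId
from cognite.extractorutils.uploader import CDMTimeSeriesUploadQueue

client = CogniteClient()  # assumes credentials are already configured

with CDMTimeSeriesUploadQueue(client, create_missing=True, max_upload_interval=30) as queue:
    queue.add_to_upload_queue(
        instance_id=NodeId("my-space", "pump-42-temperature"),
        datapoints=[(1_700_000_000_000, 21.3), (1_700_000_060_000, 21.4)],
    )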
@@ -369,7 +636,7 @@ class SequenceUploadQueue(AbstractUploadQueue):
         thread_name: str | None = None,
         create_missing: bool = False,
         cancellation_token: CancellationToken | None = None,
-    ):
+    ) -> None:
         # Super sets post_upload and threshold
         super().__init__(
             cdf_client,
cognite/extractorutils/uploader_extractor.py
CHANGED

@@ -31,8 +31,13 @@ from cognite.extractorutils.configtools import BaseConfig, TimeIntervalConfig
 from cognite.extractorutils.metrics import BaseMetrics
 from cognite.extractorutils.statestore import AbstractStateStore
 from cognite.extractorutils.threading import CancellationToken
-from cognite.extractorutils.uploader import
-
+from cognite.extractorutils.uploader import (
+    CDMTimeSeriesUploadQueue,
+    EventUploadQueue,
+    RawUploadQueue,
+    TimeSeriesUploadQueue,
+)
+from cognite.extractorutils.uploader_types import CdfTypes, Event, InsertCDMDatapoints, InsertDatapoints, RawRow


 @dataclass
@@ -100,7 +105,7 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
         heartbeat_waiting_time: int = 600,
         handle_interrupts: bool = True,
         middleware: list[Callable[[dict], dict]] | None = None,
-    ):
+    ) -> None:
         super().__init__(
             name=name,
             description=description,
@@ -153,10 +158,14 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
                         self.time_series_queue.add_to_upload_queue(
                             id=dp.id, external_id=dp.external_id, datapoints=dp.datapoints
                         )
+            elif isinstance(peek, InsertCDMDatapoints):
+                for dp in peekable_output:
+                    if isinstance(dp, InsertCDMDatapoints):
+                        self.cdm_time_series_queue.add_to_upload_queue(instance_id=dp.instance_id, datapoints=dp.datapoints)
             else:
                 raise ValueError(f"Unexpected type: {type(peek)}")

-    def _apply_middleware(self, item: Any) -> Any:
+    def _apply_middleware(self, item: Any) -> Any:  # noqa: ANN401
         for mw in self.middleware:
             item = mw(item)
         return item
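To show how this branch is reached, a hypothetical output generator for an UploaderExtractor (the instance id is made up); yielding the new InsertCDMDatapoints type routes the batch to cdm_time_series_queue:

from cognite.client.data_classes.data_modeling import NodeId
from cognite.extractorutils.uploader_types import InsertCDMDatapoints

def read_source():
    # handle_output peeks at the first item and sends InsertCDMDatapoints
    # batches to the CDM time series queue
    yield InsertCDMDatapoints(
        instance_id=NodeId("my-space", "pump-42-temperature"),
        datapoints=[(1_700_000_000_000, 21.3)],
    )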
@@ -187,7 +196,12 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
             trigger_log_level="INFO",
             create_missing=True,
         ).__enter__()
-
+        self.cdm_time_series_queue = CDMTimeSeriesUploadQueue(
+            self.cognite_client,
+            max_queue_size=queue_config.timeseries_size,
+            max_upload_interval=queue_config.upload_interval.seconds,
+            trigger_log_level="INFO",
+        ).__enter__()
         return self

     def __exit__(
@@ -199,4 +213,5 @@ class UploaderExtractor(Extractor[UploaderExtractorConfigClass]):
         self.event_queue.__exit__(exc_type, exc_val, exc_tb)
         self.raw_queue.__exit__(exc_type, exc_val, exc_tb)
         self.time_series_queue.__exit__(exc_type, exc_val, exc_tb)
+        self.cdm_time_series_queue.__exit__(exc_type, exc_val, exc_tb)
         return super().__exit__(exc_type, exc_val, exc_tb)
cognite/extractorutils/uploader_types.py
CHANGED

@@ -9,6 +9,7 @@ from typing import TypeAlias

 from cognite.client.data_classes import Event as _Event
 from cognite.client.data_classes import Row as _Row
+from cognite.client.data_classes.data_modeling import NodeId
 from cognite.extractorutils.uploader.time_series import DataPoint


@@ -17,18 +18,28 @@ class InsertDatapoints:
     A class representing a batch of datapoints to be inserted into a time series.
     """

-    def __init__(self, *, id: int | None = None, external_id: str | None = None, datapoints: list[DataPoint]):  # noqa: A002
+    def __init__(self, *, id: int | None = None, external_id: str | None = None, datapoints: list[DataPoint]) -> None:  # noqa: A002
         self.id = id
         self.external_id = external_id
         self.datapoints = datapoints


+class InsertCDMDatapoints:
+    """
+    A class representing a batch of datapoints to be inserted into a cdm time series.
+    """
+
+    def __init__(self, *, instance_id: NodeId, datapoints: list[DataPoint]) -> None:
+        self.instance_id = instance_id
+        self.datapoints = datapoints
+
+
 class RawRow:
     """
     A class representing a row of data to be inserted into a RAW table.
     """

-    def __init__(self, db_name: str, table_name: str, row: _Row | Iterable[_Row]):
+    def __init__(self, db_name: str, table_name: str, row: _Row | Iterable[_Row]) -> None:
         self.db_name = db_name
         self.table_name = table_name
         if isinstance(row, Iterable):
cognite/extractorutils/util.py
CHANGED

@@ -30,12 +30,14 @@ from typing import Any, TypeVar
 from decorator import decorator

 from cognite.client import CogniteClient
+from cognite.client._api.assets import AssetsAPI
+from cognite.client._api.time_series import TimeSeriesAPI
 from cognite.client.data_classes import Asset, ExtractionPipelineRun, TimeSeries
 from cognite.client.exceptions import CogniteAPIError, CogniteException, CogniteFileUploadError, CogniteNotFoundError
 from cognite.extractorutils.threading import CancellationToken


-def _ensure(endpoint:
+def _ensure(endpoint: TimeSeriesAPI | AssetsAPI, items: Iterable[Any]) -> None:
     try:
         external_ids = [ts.external_id for ts in items]

@@ -90,7 +92,7 @@ class EitherId:
         TypeError: If none of both of id types are set.
     """

-    def __init__(self, **kwargs: int | str | None):
+    def __init__(self, **kwargs: int | str | None) -> None:
         internal_id = kwargs.get("id")
         external_id = kwargs.get("externalId") or kwargs.get("external_id")

@@ -127,7 +129,7 @@ class EitherId:
         """
         return self.internal_id or self.external_id  # type: ignore  # checked to be not None in init

-    def __eq__(self, other:
+    def __eq__(self, other: object) -> bool:
         """
         Compare with another object. Only returns true if other is an EitherId with the same type and content.

@@ -210,7 +212,7 @@ def add_extraction_pipeline(

     def decorator_ext_pip(input_function: Callable[..., _T1]) -> Callable[..., _T1]:
         @wraps(input_function)
-        def wrapper_ext_pip(*args: Any, **kwargs: Any) -> _T1:
+        def wrapper_ext_pip(*args: Any, **kwargs: Any) -> _T1:  # noqa: ANN401
             ##############################
             # Setup Extraction Pipelines #
             ##############################
@@ -397,7 +399,7 @@ def retry(
     """

     @decorator
-    def retry_decorator(f: Callable[..., _T2], *fargs: Any, **fkwargs: Any) -> _T2:
+    def retry_decorator(f: Callable[..., _T2], *fargs: Any, **fkwargs: Any) -> _T2:  # noqa: ANN401
         args = fargs if fargs else []
         kwargs = fkwargs if fkwargs else {}

@@ -657,7 +659,7 @@ def iterable_to_stream(
         def readable(self) -> bool:
             return True

-        def readinto(self, buffer:
+        def readinto(self, buffer: "WritableBuffer") -> int | None:  # type: ignore[name-defined]  # noqa: F821
            try:
                # Bytes to return
                ln = len(buffer)
{cognite_extractor_utils-7.6.0.dist-info → cognite_extractor_utils-7.8.0.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cognite-extractor-utils
-Version: 7.6.0
+Version: 7.8.0
 Summary: Utilities for easier development of extractors for CDF
 Project-URL: repository, https://github.com/cognitedata/python-extractor-utils
 Author-email: Mathias Lohne <mathias.lohne@cognite.com>
@@ -12,7 +12,7 @@ Requires-Python: >=3.10
 Requires-Dist: arrow>=1.0.0
 Requires-Dist: azure-identity>=1.14.0
 Requires-Dist: azure-keyvault-secrets>=4.7.0
-Requires-Dist: cognite-sdk>=7.
+Requires-Dist: cognite-sdk>=7.75.2
 Requires-Dist: croniter>=6.0.0
 Requires-Dist: dacite<1.9.0,>=1.6.0
 Requires-Dist: decorator>=5.1.1
@@ -26,6 +26,7 @@ Requires-Dist: pydantic>=2.8.2
 Requires-Dist: pyhumps>=3.8.0
 Requires-Dist: python-dotenv>=1.0.0
 Requires-Dist: pyyaml<7,>=5.3.0
+Requires-Dist: simple-winservice>=0.1.0; sys_platform == 'win32'
 Requires-Dist: typing-extensions<5,>=3.7.4
 Provides-Extra: experimental
 Requires-Dist: cognite-sdk-experimental; extra == 'experimental'