cognite-extractor-utils 7.4.9__py3-none-any.whl → 7.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-extractor-utils might be problematic. Click here for more details.

@@ -16,5 +16,5 @@
16
16
  Cognite extractor utils is a Python package that simplifies the development of new extractors.
17
17
  """
18
18
 
19
- __version__ = "7.4.9"
19
+ __version__ = "7.5.1"
20
20
  from .base import Extractor
@@ -0,0 +1,34 @@
1
+ """
2
+ Temporary holding place for DTOs against Extraction Pipelines 2.0 until it's in the SDK
3
+ """
4
+
5
+ from typing import Any, Literal
6
+
7
+ from humps import camelize
8
+ from pydantic import BaseModel, ConfigDict
9
+
10
+
11
class CogniteModel(BaseModel):
    """
    Common pydantic base class for DTOs, adjusted towards the CDF API guidelines:
      * field aliases are generated with ``camelize``, and a model can be populated using either the
        generated alias or the python field name
      * unknown fields are rejected (``extra="forbid"``)
      * ``None`` values are always left out when the model is dumped to a dict
    """

    model_config = ConfigDict(alias_generator=camelize, populate_by_name=True, extra="forbid")

    def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
        """
        Dump the model to a dictionary, always with ``exclude_none=True``.

        All other arguments are passed on unchanged to ``BaseModel.model_dump``.
        """
        # A caller-supplied ``exclude_none`` is deliberately overridden
        forced = {**kwargs, "exclude_none": True}
        return BaseModel.model_dump(self, *args, **forced)

    def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
        """
        Same as :meth:`model_dump`; all arguments are forwarded as-is.
        """
        return self.model_dump(*args, **kwargs)
29
+
30
+
31
class TaskUpdate(CogniteModel):
    """
    A single task lifecycle event.
    """

    # Which transition this event describes
    type: Literal["started"] | Literal["ended"]
    # Name of the task the event belongs to
    name: str
    # Time of the event. NOTE(review): presumably milliseconds since epoch - confirm against the API
    timestamp: int
@@ -1,14 +1,20 @@
1
1
  import logging
2
+ from concurrent.futures import ThreadPoolExecutor
2
3
  from multiprocessing import Queue
3
4
  from threading import RLock, Thread
4
5
  from types import TracebackType
5
6
  from typing import Generic, Literal, Optional, Type, TypeVar, Union
6
7
 
7
- from typing_extensions import Self
8
+ from humps import pascalize
9
+ from typing_extensions import Self, assert_never
8
10
 
9
11
  from cognite.extractorutils.threading import CancellationToken
10
12
  from cognite.extractorutils.unstable.configuration.models import ConnectionConfig, ExtractorConfig
13
+ from cognite.extractorutils.unstable.core._dto import TaskUpdate
11
14
  from cognite.extractorutils.unstable.core._messaging import RuntimeMessage
15
+ from cognite.extractorutils.unstable.core.tasks import ContinuousTask, ScheduledTask, StartupTask, Task
16
+ from cognite.extractorutils.unstable.scheduling import TaskScheduler
17
+ from cognite.extractorutils.util import now
12
18
 
13
19
  ConfigType = TypeVar("ConfigType", bound=ExtractorConfig)
14
20
  ConfigRevision = Union[Literal["local"], int]
@@ -40,29 +46,39 @@ class Extractor(Generic[ConfigType]):
40
46
  self._checkin_lock = RLock()
41
47
  self._runtime_messages: Optional[Queue[RuntimeMessage]] = None
42
48
 
49
+ self._scheduler = TaskScheduler(self.cancellation_token.create_child_token())
50
+
51
+ self._tasks: list[Task] = []
52
+ self._task_updates: list[TaskUpdate] = []
53
+
43
54
  self.logger = logging.getLogger(f"{self.EXTERNAL_ID}.main")
44
55
 
45
56
def _set_runtime_message_queue(self, queue: Queue) -> None:
    """
    Store the queue used for runtime messages on this extractor instance.
    """
    self._runtime_messages = queue
47
58
 
48
- def _run_checkin(self) -> None:
49
- def checkin() -> None:
50
- body = {"externalId": self.connection_config.extraction_pipeline}
59
def _checkin(self) -> None:
    """
    Perform a single check-in request and react to a changed config revision.

    The pending task updates are drained from ``self._task_updates`` and posted as ``taskEvents`` together
    with the extraction pipeline external ID. If the response carries a truthy ``lastConfigRevision`` that
    differs from ``self.current_config_revision``, ``self.restart()`` is called.
    """
    # NOTE(review): assumes the lock only guards the snapshot below and not the HTTP call - confirm against
    # the indentation of the original file
    with self._checkin_lock:
        pending_events = []
        for update in self._task_updates:
            pending_events.append(update.model_dump())
        self._task_updates.clear()

    project = self.cognite_client.config.project
    payload = {
        "externalId": self.connection_config.extraction_pipeline,
        "taskEvents": pending_events,
    }
    response = self.cognite_client.post(
        f"/api/v1/projects/{project}/odin/checkin",
        json=payload,
        headers={"cdf-version": "alpha"},
    )

    latest_revision = response.json().get("lastConfigRevision")
    if not latest_revision or latest_revision == self.current_config_revision:
        return

    self.restart()
62
76
 
77
+ def _run_checkin(self) -> None:
63
78
  while not self.cancellation_token.is_cancelled:
64
79
  try:
65
- checkin()
80
+ self.logger.debug("Running checkin")
81
+ self._checkin()
66
82
  except Exception:
67
83
  self.logger.exception("Error during checkin")
68
84
  self.cancellation_token.wait(10)
@@ -81,7 +97,32 @@ class Extractor(Generic[ConfigType]):
81
97
  ) -> Self:
82
98
  return cls(connection_config, application_config, current_config_revision)
83
99
 
84
- def start(self) -> None:
100
def add_task(self, task: Task) -> None:
    """
    Register a task with the extractor.

    The task's target is wrapped so that a "started" update is queued right before it runs and an "ended"
    update is queued when it finishes, also when the target raises. Scheduled tasks are additionally handed
    to the scheduler.

    Args:
        task: The task to register. On success, its ``target`` attribute is replaced by the reporting wrapper.

    Raises:
        Exception: Whatever the scheduler raises when it rejects a scheduled task (for example a name that is
            already in use). In that case neither the task nor the extractor's task list is modified.
    """
    target = task.target

    def wrapped() -> None:
        with self._checkin_lock:
            self._task_updates.append(
                TaskUpdate(type="started", name=task.name, timestamp=now()),
            )

        try:
            target()

        finally:
            # Report the end of the run even if the target raised
            with self._checkin_lock:
                self._task_updates.append(
                    TaskUpdate(type="ended", name=task.name, timestamp=now()),
                )

    # Schedule before touching any state: if the scheduler rejects the task, the extractor must not keep
    # (and later report) a task that will never run.
    if isinstance(task, ScheduledTask):
        self._scheduler.schedule_task(name=task.name, schedule=task.schedule, task=wrapped)

    task.target = wrapped
    self._tasks.append(task)
124
+
125
+ def _report_extractor_info(self) -> None:
85
126
  self.cognite_client.post(
86
127
  f"/api/v1/projects/{self.cognite_client.config.project}/odin/extractorinfo",
87
128
  json={
@@ -91,9 +132,19 @@ class Extractor(Generic[ConfigType]):
91
132
  "version": self.VERSION,
92
133
  "externalId": self.EXTERNAL_ID,
93
134
  },
135
+ "tasks": [
136
+ {
137
+ "name": t.name,
138
+ "type": "continuous" if isinstance(t, ContinuousTask) else "batch",
139
+ }
140
+ for t in self._tasks
141
+ ],
94
142
  },
95
143
  headers={"cdf-version": "alpha"},
96
144
  )
145
+
146
def start(self) -> None:
    """
    Report extractor info, then start the check-in loop on a daemon thread named "ExtractorCheckin".
    """
    self._report_extractor_info()

    checkin_thread = Thread(target=self._run_checkin, name="ExtractorCheckin", daemon=True)
    checkin_thread.start()
98
149
 
99
150
  def stop(self) -> None:
@@ -110,7 +161,43 @@ class Extractor(Generic[ConfigType]):
110
161
  exc_tb: Optional[TracebackType],
111
162
  ) -> bool:
112
163
  self.stop()
164
+ with self._checkin_lock:
165
+ self._checkin()
166
+
113
167
  return exc_val is None
114
168
 
115
169
  def run(self) -> None:
116
- raise NotImplementedError()
170
+ has_scheduled = False
171
+
172
+ startup: list[StartupTask] = []
173
+ continuous: list[ContinuousTask] = []
174
+
175
+ for task in self._tasks:
176
+ match task:
177
+ case ScheduledTask():
178
+ has_scheduled = True
179
+
180
+ case StartupTask() as t:
181
+ startup.append(t)
182
+
183
+ case ContinuousTask() as t:
184
+ continuous.append(t)
185
+
186
+ case _:
187
+ assert_never(task)
188
+
189
+ self.logger.info("Starting up extractor")
190
+ if startup:
191
+ with ThreadPoolExecutor() as pool:
192
+ for task in startup:
193
+ pool.submit(task.target)
194
+ self.logger.info("Startup done")
195
+
196
+ for task in continuous:
197
+ Thread(name=pascalize(task.name), target=task.target).start()
198
+
199
+ if has_scheduled:
200
+ self._scheduler.run()
201
+
202
+ else:
203
+ self.cancellation_token.wait()
@@ -0,0 +1,29 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from typing import Callable
4
+
5
+ from cognite.extractorutils.unstable.configuration.models import ScheduleConfig
6
+
7
+
8
@dataclass
class _Task(ABC):
    """
    Fields shared by all task kinds.
    """

    # Name identifying the task
    name: str
    # Zero-argument callable holding the work of the task
    target: Callable[[], None]


@dataclass
class ScheduledTask(_Task):
    """
    A task that carries a schedule configuration in addition to the common task fields.
    """

    schedule: ScheduleConfig


@dataclass
class ContinuousTask(_Task):
    """
    A task with only the common task fields, distinguished from the other kinds by its type.
    """


# Declared as a dataclass like its siblings, so that all task kinds are defined the same way
@dataclass
class StartupTask(_Task):
    """
    A task with only the common task fields, distinguished from the other kinds by its type.
    """


# Making a type union to help with exhaustion checks in matches
Task = ScheduledTask | ContinuousTask | StartupTask
@@ -42,6 +42,8 @@ class TaskScheduler:
42
42
  parsed_schedule = IntervalSchedule(interval=interval_config.expression.seconds)
43
43
 
44
44
  with self._jobs_lock:
45
+ if name in self._jobs:
46
+ raise KeyError(f"Job '{name}' is already added to the scheduler")
45
47
  self._jobs[name] = Job(name=name, call=task, schedule=parsed_schedule)
46
48
 
47
49
  def _get_next(self) -> list[Job]:
@@ -230,7 +230,7 @@ class IOFileUploadQueue(AbstractUploadQueue):
230
230
 
231
231
  self._full_queue = threading.Condition()
232
232
 
233
- self._httpx_client = Client(follow_redirects=True)
233
+ self._httpx_client = Client(follow_redirects=True, timeout=cdf_client.config.file_transfer_timeout)
234
234
 
235
235
  global _QUEUES, _QUEUES_LOCK
236
236
  with _QUEUES_LOCK:
@@ -17,10 +17,12 @@ The ``util`` package contains miscellaneous functions and classes that can some
17
17
  extractors.
18
18
  """
19
19
 
20
+ import io
20
21
  import logging
21
22
  import random
22
23
  from datetime import datetime, timezone
23
24
  from functools import partial, wraps
25
+ from io import RawIOBase
24
26
  from threading import Thread
25
27
  from time import time
26
28
  from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Tuple, Type, TypeVar, Union
@@ -510,3 +512,113 @@ def datetime_to_timestamp(dt: datetime) -> int:
510
512
 
511
513
def timestamp_to_datetime(ts: int) -> datetime:
    """
    Convert a timestamp in milliseconds since epoch into a timezone-aware (UTC) datetime.

    Args:
        ts: timestamp, milliseconds since epoch

    Returns:
        The corresponding datetime, with UTC as its timezone
    """
    epoch_seconds = ts / 1000
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
515
+
516
+
517
def now() -> int:
    """
    Current time in CDF format (milliseconds since 1970-01-01 00:00:00 UTC)
    """
    epoch_seconds = time()
    return int(epoch_seconds * 1000)
522
+
523
+
524
def truncate_byte_len(item: str, ln: int) -> str:
    """Safely truncate an arbitrary utf-8 string.
    Used to sanitize metadata.

    The result is the longest prefix of ``item`` whose utf-8 encoding is at most ``ln`` bytes long. A
    character is never cut in half: if the byte limit falls inside a multi-byte character, that character
    is dropped entirely.

    Args:
        item (str): string to be truncated
        ln (int): length (bytes)

    Returns:
        str: truncated string
    """

    encoded = item.encode("utf-8")
    if len(encoded) <= ln:
        return item
    if ln <= 0:
        # No room for any bytes at all (also avoids slicing from the end with a negative length)
        return ""

    # The encoded bytes are valid utf-8, so the only thing that can be invalid after slicing is a partial
    # multi-byte sequence at the very end. errors="ignore" drops exactly those trailing bytes.
    return encoded[:ln].decode("utf-8", errors="ignore")
569
+
570
+
571
+ class BufferedReadWithLength(io.BufferedReader):
572
+ def __init__(
573
+ self, raw: RawIOBase, buffer_size: int, len: int, on_close: Optional[Callable[[], None]] = None
574
+ ) -> None:
575
+ super().__init__(raw, buffer_size)
576
+ # Do not remove even if it appears to be unused. :P
577
+ # Requests uses this to add the content-length header, which is necessary for writing to files in azure clusters
578
+ self.len = len
579
+ self.on_close = on_close
580
+
581
+ def close(self) -> None:
582
+ if self.on_close:
583
+ self.on_close()
584
+ return super().close()
585
+
586
+
587
+ def iterable_to_stream(
588
+ iterator: Iterable[bytes],
589
+ file_size_bytes: int,
590
+ buffer_size: int = io.DEFAULT_BUFFER_SIZE,
591
+ on_close: Optional[Callable[[], None]] = None,
592
+ ) -> BufferedReadWithLength:
593
+ class ChunkIteratorStream(io.RawIOBase):
594
+ def __init__(self) -> None:
595
+ self.last_chunk = None
596
+ self.loaded_bytes = 0
597
+ self.file_size_bytes = file_size_bytes
598
+
599
+ def tell(self) -> int:
600
+ return self.loaded_bytes
601
+
602
+ def __len__(self) -> int:
603
+ return self.file_size_bytes
604
+
605
+ def readable(self) -> bool:
606
+ return True
607
+
608
+ def readinto(self, buffer: Any) -> int | None:
609
+ try:
610
+ # Bytes to return
611
+ ln = len(buffer)
612
+ chunk = self.last_chunk or next(iterator) # type: ignore
613
+ output, self.last_chunk = chunk[:ln], chunk[ln:]
614
+ if len(self.last_chunk) == 0: # type: ignore
615
+ self.last_chunk = None
616
+ buffer[: len(output)] = output
617
+ self.loaded_bytes += len(output)
618
+ return len(output)
619
+ except StopIteration:
620
+ return 0
621
+
622
+ return BufferedReadWithLength(
623
+ ChunkIteratorStream(), buffer_size=buffer_size, len=file_size_bytes, on_close=on_close
624
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cognite-extractor-utils
3
- Version: 7.4.9
3
+ Version: 7.5.1
4
4
  Summary: Utilities for easier development of extractors for CDF
5
5
  Home-page: https://github.com/cognitedata/python-extractor-utils
6
6
  License: Apache-2.0
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
13
13
  Classifier: Programming Language :: Python :: 3.10
14
14
  Classifier: Programming Language :: Python :: 3.11
15
15
  Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
16
17
  Provides-Extra: experimental
17
18
  Requires-Dist: arrow (>=1.0.0,<2.0.0)
18
19
  Requires-Dist: azure-identity (>=1.14.0,<2.0.0)
@@ -1,4 +1,4 @@
1
- cognite/extractorutils/__init__.py,sha256=Cxd38BeR0YorUxKXaS9dz_FKoKa2trhrZJFhiHh0EuY,739
1
+ cognite/extractorutils/__init__.py,sha256=UC4u-ugttsnuWjP4G_NqFqLVcfMbKVb68mowVYvJXXE,739
2
2
  cognite/extractorutils/_inner_util.py,sha256=gmz6aqS7jDNsg8z4RHgJjMFohDLOMiaU4gMWBhg3xcE,1558
3
3
  cognite/extractorutils/base.py,sha256=q6NU2bPec3WOasVnnIFoh-aUJudVZWZ2R6emz3IRj8Q,16391
4
4
  cognite/extractorutils/configtools/__init__.py,sha256=YEpFGJoza23eM8Zj5DqqUj7sEstERV_QYsN6Nw4dKCg,3092
@@ -20,11 +20,13 @@ cognite/extractorutils/unstable/configuration/loaders.py,sha256=9TqVLKGiFl7L-6SL
20
20
  cognite/extractorutils/unstable/configuration/models.py,sha256=fPu56TkFKMRr5uc5R3X4WlHNDofvyH0JZfORvwwDT44,7727
21
21
  cognite/extractorutils/unstable/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  cognite/extractorutils/unstable/core/__main__.py,sha256=8Tb0RqeRBs47dU5xOikZog-IM2IKVonj9dnPG7Ia710,728
23
+ cognite/extractorutils/unstable/core/_dto.py,sha256=OP_cWZ2EvRbq3Tcczc-NWbYsxPZJ0HPgnKyZw-OAZ-I,1117
23
24
  cognite/extractorutils/unstable/core/_messaging.py,sha256=D9rOW8fijryXffbm90d8VTf2vy5FmwVGU-H0O-cn-EI,68
24
- cognite/extractorutils/unstable/core/base.py,sha256=y8KYppvC-9EhdwxZ4XWhYY4VRYGVS1jDs7x_QqVpS_4,3955
25
+ cognite/extractorutils/unstable/core/base.py,sha256=nvvMc8ddi742M-_CIzsHQSlUswMyn4uAX3ceHp3Db_k,6745
25
26
  cognite/extractorutils/unstable/core/runtime.py,sha256=sRrPsICawgEHFU9DYArDpIMxebudR0BZ5K3h6HAW9As,6054
27
+ cognite/extractorutils/unstable/core/tasks.py,sha256=JjzfQDpCmNZxWnSJxmhrK3TvxNbmzRxEjgjHJQQO__c,515
26
28
  cognite/extractorutils/unstable/scheduling/__init__.py,sha256=L90_rCZNHvti-PInne0r7W9edIkifctELjiaxEoQiSc,67
27
- cognite/extractorutils/unstable/scheduling/_scheduler.py,sha256=6Qui9v0BcSq-wJONxeqPs6W5SS_5paGW8nvzA3LESVA,3574
29
+ cognite/extractorutils/unstable/scheduling/_scheduler.py,sha256=w2Hs1u3-cNjxrZHtoNFvCmLCd0GNU52K4uUd-Yo_RrM,3691
28
30
  cognite/extractorutils/unstable/scheduling/_schedules.py,sha256=y0NVeXYZOFcAyzBgAe8jqK0W-SZL5m99UwXAacGzqIw,677
29
31
  cognite/extractorutils/uploader/__init__.py,sha256=W22u6QHA4cR0j78LN5LTL5YGbfC-uTApagTyP5ab7uQ,3110
30
32
  cognite/extractorutils/uploader/_base.py,sha256=wktbV8dpb8zBOsNaECZkBNoJSpOz437NlNMER3-a3xQ,5304
@@ -32,13 +34,13 @@ cognite/extractorutils/uploader/_metrics.py,sha256=J2LJXb19L_SLSJ_voNIQHYLp0pjxU
32
34
  cognite/extractorutils/uploader/assets.py,sha256=2E90N1kxsaA6Ah4h0_r_dTVhDYY_68ItRWrHYkkltJw,5628
33
35
  cognite/extractorutils/uploader/data_modeling.py,sha256=w35Ix5mu0Cgfn4ywnDyif4VVjo04LVTlkMEevk6ztUs,3639
34
36
  cognite/extractorutils/uploader/events.py,sha256=NZP2tMoU_rh_rb-EZiUBsOT5KdNABHN4c9Oddk0OsdE,5680
35
- cognite/extractorutils/uploader/files.py,sha256=hrm3pcLGietJAoLuGWKKBaOlE1rnDlf1NgwvtNmCjwQ,22532
37
+ cognite/extractorutils/uploader/files.py,sha256=9yLtqStAQuFj33Fi12zo5HJYo-HdVYop8tkAHsmhHXg,22581
36
38
  cognite/extractorutils/uploader/raw.py,sha256=wFjF90PFTjmByOWx_Y4_YfDJ2w2jl0EQJ2Tjx2MP2PM,6738
37
39
  cognite/extractorutils/uploader/time_series.py,sha256=HBtQdsQoIOaL-EG5lMsaY-ORwVb0kGiXG86VjE5-_Bg,26815
38
40
  cognite/extractorutils/uploader_extractor.py,sha256=E-mpVvbPg_Tk90U4S9JybV0duptJ2SXE88HB6npE3zI,7732
39
41
  cognite/extractorutils/uploader_types.py,sha256=wxfrsiKPTzG5lmoYtQsxt8Xyj-s5HnaLl8WDzJNrazg,1020
40
- cognite/extractorutils/util.py,sha256=T6ef5b7aYJ8yq9swQwybYaLe3YGr3hElsJQy8E-d5Rs,17469
41
- cognite_extractor_utils-7.4.9.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
42
- cognite_extractor_utils-7.4.9.dist-info/METADATA,sha256=I5rq1WJzAnrchQKwS75G30Xx_qZkJ-ZDYP9w_8RZ2jw,5598
43
- cognite_extractor_utils-7.4.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
44
- cognite_extractor_utils-7.4.9.dist-info/RECORD,,
42
+ cognite/extractorutils/util.py,sha256=cFrbO8XCFT7QBs2vvOk4dWyOvaHclvsuPlrDJYFG0xA,21160
43
+ cognite_extractor_utils-7.5.1.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
44
+ cognite_extractor_utils-7.5.1.dist-info/METADATA,sha256=9CbjaMli40Sd2X88ksrr7gx0be_bilWAY4RtOtHLa14,5649
45
+ cognite_extractor_utils-7.5.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
46
+ cognite_extractor_utils-7.5.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any