cognite-extractor-utils 7.4.7__tar.gz → 7.4.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-extractor-utils might be problematic. Click here for more details.

Files changed (44)
  1. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/PKG-INFO +2 -1
  2. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/__init__.py +1 -1
  3. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/_util.py +4 -2
  4. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/configuration/loaders.py +3 -3
  5. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/configuration/models.py +82 -2
  6. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/__init__.py +0 -0
  7. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/__main__.py +31 -0
  8. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/_messaging.py +5 -0
  9. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/base.py +116 -0
  10. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/runtime.py +171 -0
  11. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/scheduling/__init__.py +3 -0
  12. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/scheduling/_scheduler.py +102 -0
  13. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/scheduling/_schedules.py +31 -0
  14. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/files.py +13 -2
  15. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/pyproject.toml +3 -2
  16. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/LICENSE +0 -0
  17. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/README.md +0 -0
  18. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/_inner_util.py +0 -0
  19. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/base.py +0 -0
  20. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/__init__.py +0 -0
  21. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/elements.py +0 -0
  22. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/loaders.py +0 -0
  23. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/validators.py +0 -0
  24. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/exceptions.py +0 -0
  25. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/metrics.py +0 -0
  26. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/py.typed +0 -0
  27. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/__init__.py +0 -0
  28. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/_base.py +0 -0
  29. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/hashing.py +0 -0
  30. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/watermark.py +0 -0
  31. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/threading.py +0 -0
  32. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/__init__.py +0 -0
  33. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/configuration/__init__.py +0 -0
  34. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/__init__.py +0 -0
  35. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/_base.py +0 -0
  36. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/_metrics.py +0 -0
  37. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/assets.py +0 -0
  38. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/data_modeling.py +0 -0
  39. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/events.py +0 -0
  40. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/raw.py +0 -0
  41. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/time_series.py +0 -0
  42. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader_extractor.py +0 -0
  43. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader_types.py +0 -0
  44. {cognite_extractor_utils-7.4.7 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cognite-extractor-utils
3
- Version: 7.4.7
3
+ Version: 7.4.8
4
4
  Summary: Utilities for easier development of extractors for CDF
5
5
  Home-page: https://github.com/cognitedata/python-extractor-utils
6
6
  License: Apache-2.0
@@ -18,6 +18,7 @@ Requires-Dist: arrow (>=1.0.0,<2.0.0)
18
18
  Requires-Dist: azure-identity (>=1.14.0,<2.0.0)
19
19
  Requires-Dist: azure-keyvault-secrets (>=4.7.0,<5.0.0)
20
20
  Requires-Dist: cognite-sdk (>=7.59.0,<8.0.0)
21
+ Requires-Dist: croniter (>=3.0.3,<4.0.0)
21
22
  Requires-Dist: dacite (>=1.6.0,<2.0.0)
22
23
  Requires-Dist: decorator (>=5.1.1,<6.0.0)
23
24
  Requires-Dist: httpx (>=0.27.0,<0.28.0)
@@ -16,5 +16,5 @@
16
16
  Cognite extractor utils is a Python package that simplifies the development of new extractors.
17
17
  """
18
18
 
19
- __version__ = "7.4.7"
19
+ __version__ = "7.4.8"
20
20
  from .base import Extractor
@@ -81,8 +81,10 @@ def _to_snake_case(dictionary: Dict[str, Any], case_style: str) -> Dict[str, Any
81
81
  raise ValueError(f"Invalid case style: {case_style}")
82
82
 
83
83
 
84
- def _load_certificate_data(cert_path: str, password: Optional[str]) -> Union[Tuple[str, str], Tuple[bytes, bytes]]:
85
- path = Path(cert_path)
84
+ def _load_certificate_data(
85
+ cert_path: str | Path, password: Optional[str]
86
+ ) -> Union[Tuple[str, str], Tuple[bytes, bytes]]:
87
+ path = Path(cert_path) if isinstance(cert_path, str) else cert_path
86
88
  cert_data = Path(path).read_bytes()
87
89
 
88
90
  if path.suffix == ".pem":
@@ -2,7 +2,7 @@ import json
2
2
  from enum import Enum
3
3
  from io import StringIO
4
4
  from pathlib import Path
5
- from typing import Dict, Optional, TextIO, Type, TypeVar, Union
5
+ from typing import Dict, Optional, TextIO, Tuple, Type, TypeVar, Union
6
6
 
7
7
  from pydantic import ValidationError
8
8
 
@@ -33,7 +33,7 @@ def load_file(path: Path, schema: Type[_T]) -> _T:
33
33
 
34
34
  def load_from_cdf(
35
35
  cognite_client: CogniteClient, external_id: str, schema: Type[_T], revision: Optional[int] = None
36
- ) -> _T:
36
+ ) -> Tuple[_T, int]:
37
37
  params: Dict[str, Union[str, int]] = {"externalId": external_id}
38
38
  if revision:
39
39
  params["revision"] = revision
@@ -44,7 +44,7 @@ def load_from_cdf(
44
44
  )
45
45
  response.raise_for_status()
46
46
  data = response.json()
47
- return load_io(StringIO(data["config"]), ConfigFormat.YAML, schema)
47
+ return load_io(StringIO(data["config"]), ConfigFormat.YAML, schema), data["revision"]
48
48
 
49
49
 
50
50
  def load_io(stream: TextIO, format: ConfigFormat, schema: Type[_T]) -> _T:
@@ -7,7 +7,16 @@ from typing import Annotated, Any, Dict, List, Literal, Optional, Union
7
7
  from humps import kebabize
8
8
  from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler
9
9
  from pydantic_core import CoreSchema, core_schema
10
-
10
+ from typing_extensions import assert_never
11
+
12
+ from cognite.client import CogniteClient
13
+ from cognite.client.config import ClientConfig
14
+ from cognite.client.credentials import (
15
+ CredentialProvider,
16
+ OAuthClientCertificate,
17
+ OAuthClientCredentials,
18
+ )
19
+ from cognite.extractorutils.configtools._util import _load_certificate_data
11
20
  from cognite.extractorutils.exceptions import InvalidConfigError
12
21
 
13
22
 
@@ -33,7 +42,9 @@ class _ClientCredentialsConfig(ConfigModel):
33
42
  class _ClientCertificateConfig(ConfigModel):
34
43
  type: Literal["client-certificate"]
35
44
  client_id: str
36
- certificate_path: Path
45
+ path: Path
46
+ password: Optional[str] = None
47
+ authority_url: str
37
48
  scopes: List[str]
38
49
 
39
50
 
@@ -121,6 +132,7 @@ class _ConnectionParameters(ConfigModel):
121
132
  max_connection_pool_size: int = 50
122
133
  ssl_verify: bool = True
123
134
  proxies: Dict[str, str] = Field(default_factory=dict)
135
+ timeout: TimeIntervalConfig = Field(default_factory=lambda: TimeIntervalConfig("30s"))
124
136
 
125
137
 
126
138
  class ConnectionConfig(ConfigModel):
@@ -133,6 +145,74 @@ class ConnectionConfig(ConfigModel):
133
145
 
134
146
  connection: _ConnectionParameters = Field(default_factory=_ConnectionParameters)
135
147
 
148
+ def get_cognite_client(self, client_name: str) -> CogniteClient:
149
+ from cognite.client.config import global_config
150
+
151
+ global_config.disable_pypi_version_check = True
152
+ global_config.disable_gzip = not self.connection.gzip_compression
153
+ global_config.status_forcelist = set(self.connection.status_forcelist)
154
+ global_config.max_retries = self.connection.max_retries
155
+ global_config.max_retries_connect = self.connection.max_retries_connect
156
+ global_config.max_retry_backoff = self.connection.max_retry_backoff.seconds
157
+ global_config.max_connection_pool_size = self.connection.max_connection_pool_size
158
+ global_config.disable_ssl = not self.connection.ssl_verify
159
+ global_config.proxies = self.connection.proxies
160
+
161
+ credential_provider: CredentialProvider
162
+ match self.authentication:
163
+ case _ClientCredentialsConfig() as client_credentials:
164
+ kwargs = {
165
+ "token_url": client_credentials.token_url,
166
+ "client_id": client_credentials.client_id,
167
+ "client_secret": client_credentials.client_secret,
168
+ "scopes": client_credentials.scopes,
169
+ }
170
+ if client_credentials.audience is not None:
171
+ kwargs["audience"] = client_credentials.audience
172
+ if client_credentials.resource is not None:
173
+ kwargs["resource"] = client_credentials.resource
174
+
175
+ credential_provider = OAuthClientCredentials(**kwargs) # type: ignore # I know what I'm doing
176
+
177
+ case _ClientCertificateConfig() as client_certificate:
178
+ thumbprint, key = _load_certificate_data(
179
+ client_certificate.path,
180
+ client_certificate.password,
181
+ )
182
+ credential_provider = OAuthClientCertificate(
183
+ authority_url=client_certificate.authority_url,
184
+ client_id=client_certificate.client_id,
185
+ cert_thumbprint=str(thumbprint),
186
+ certificate=str(key),
187
+ scopes=client_certificate.scopes,
188
+ )
189
+
190
+ case _:
191
+ assert_never(self.authentication)
192
+
193
+ client_config = ClientConfig(
194
+ project=self.project,
195
+ base_url=self.base_url,
196
+ client_name=client_name,
197
+ timeout=self.connection.timeout.seconds,
198
+ credentials=credential_provider,
199
+ )
200
+
201
+ return CogniteClient(client_config)
202
+
203
+
204
+ class CronConfig(ConfigModel):
205
+ type: Literal["cron"]
206
+ expression: str
207
+
208
+
209
+ class IntervalConfig(ConfigModel):
210
+ type: Literal["interval"]
211
+ expression: TimeIntervalConfig
212
+
213
+
214
+ ScheduleConfig = Annotated[CronConfig | IntervalConfig, Field(discriminator="type")]
215
+
136
216
 
137
217
  class LogLevel(Enum):
138
218
  CRITICAL = "CRITICAL"
@@ -0,0 +1,31 @@
1
+ """
2
+ Example of how you would build an extractor with the new base class
3
+ """
4
+
5
+ from cognite.extractorutils.unstable.configuration.models import ExtractorConfig
6
+
7
+ from .base import Extractor
8
+ from .runtime import Runtime
9
+
10
+
11
class MyConfig(ExtractorConfig):
    """Application configuration for the example extractor: two required parameters."""

    parameter_one: int
    parameter_two: str
14
+
15
+
16
class MyExtractor(Extractor[MyConfig]):
    """Minimal example extractor built on the new base class, configured by ``MyConfig``."""

    # Metadata read by the base class and the runtime (client name, logger names, --version output)
    NAME = "Test extractor"
    EXTERNAL_ID = "test-extractor"
    DESCRIPTION = "Test of the new runtime"
    VERSION = "1.0.0"
    CONFIG_TYPE = MyConfig

    def run(self) -> None:
        """Log a start message, wait up to 10 seconds on the cancellation token, then possibly fail."""
        self.logger.info("Started!")
        # NOTE(review): presumably wait() returns a truthy value when the token was cancelled, so an
        # uninterrupted run ends in the ValueError below - likely to demonstrate the runtime's restart
        # handling. Confirm against CancellationToken.wait.
        if not self.cancellation_token.wait(10):
            raise ValueError("Oops")
27
+
28
+
29
# Script entry point: hand the extractor class to the runtime, which parses the command line,
# loads configuration and runs the extractor in a separate process.
if __name__ == "__main__":
    runtime = Runtime(MyExtractor)
    runtime.run()
@@ -0,0 +1,5 @@
1
+ from enum import Enum
2
+
3
+
4
class RuntimeMessage(Enum):
    """Messages passed on the queue between an extractor process and the runtime that spawned it."""

    # Ask the runtime to start the extractor again
    RESTART = 1
@@ -0,0 +1,116 @@
1
+ import logging
2
+ from multiprocessing import Queue
3
+ from threading import RLock, Thread
4
+ from types import TracebackType
5
+ from typing import Generic, Literal, Optional, Type, TypeVar, Union
6
+
7
+ from typing_extensions import Self
8
+
9
+ from cognite.extractorutils.threading import CancellationToken
10
+ from cognite.extractorutils.unstable.configuration.models import ConnectionConfig, ExtractorConfig
11
+ from cognite.extractorutils.unstable.core._messaging import RuntimeMessage
12
+
13
+ ConfigType = TypeVar("ConfigType", bound=ExtractorConfig)
14
+ ConfigRevision = Union[Literal["local"], int]
15
+
16
+
17
class Extractor(Generic[ConfigType]):
    """
    Base class for extractors that are started by the runtime.

    Subclasses provide the class-level metadata below and override ``run``. Used as a context manager, the
    extractor reports itself to CDF on entry, starts a background check-in thread, and cancels its cancellation
    token on exit.

    Args:
        connection_config: Connection parameters, used to create the Cognite client.
        application_config: The extractor specific configuration.
        current_config_revision: Revision number of ``application_config`` as stored in CDF, or ``"local"`` if
            the configuration was loaded from a local file.
    """

    NAME: str
    EXTERNAL_ID: str
    DESCRIPTION: str
    VERSION: str

    CONFIG_TYPE: Type[ConfigType]

    def __init__(
        self,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> None:
        self.cancellation_token = CancellationToken()
        self.cancellation_token.cancel_on_interrupt()

        self.connection_config = connection_config
        self.application_config = application_config
        self.current_config_revision = current_config_revision

        self.cognite_client = self.connection_config.get_cognite_client(f"{self.EXTERNAL_ID}-{self.VERSION}")

        self._checkin_lock = RLock()
        # Set by the runtime through _set_runtime_message_queue. Stays None when running without a runtime.
        self._runtime_messages: Optional[Queue[RuntimeMessage]] = None

        self.logger = logging.getLogger(f"{self.EXTERNAL_ID}.main")

    def _set_runtime_message_queue(self, queue: Queue) -> None:
        """Attach the queue used to send messages (such as restart requests) to the runtime."""
        self._runtime_messages = queue

    def _run_checkin(self) -> None:
        """
        Check in with CDF every 10 seconds until the cancellation token is cancelled.

        Each check-in returns the latest configuration revision known to CDF. If it differs from the revision
        this extractor was started with, a restart is requested so the runtime can load the new configuration.
        Errors during a check-in are logged and the loop continues.
        """

        def checkin() -> None:
            body = {"externalId": self.connection_config.extraction_pipeline}

            with self._checkin_lock:
                res = self.cognite_client.post(
                    f"/api/v1/projects/{self.cognite_client.config.project}/odin/checkin",
                    json=body,
                    headers={"cdf-version": "alpha"},
                )
                new_config_revision = res.json().get("lastConfigRevision")

            # A local override never matches a revision number from CDF. Comparing them would request a
            # restart on every check-in, and the runtime would reload the same local file forever.
            if self.current_config_revision == "local":
                return

            if new_config_revision and new_config_revision != self.current_config_revision:
                self.restart()

        while not self.cancellation_token.is_cancelled:
            try:
                checkin()
            except Exception:
                self.logger.exception("Error during checkin")
            self.cancellation_token.wait(10)

    def restart(self) -> None:
        """Ask the runtime (if one is attached) to restart the extractor, then cancel the current run."""
        if self._runtime_messages is not None:
            self._runtime_messages.put(RuntimeMessage.RESTART)
        self.cancellation_token.cancel()

    @classmethod
    def init_from_runtime(
        cls,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> Self:
        """Constructor used by the runtime. Forwards its arguments to ``__init__``."""
        return cls(connection_config, application_config, current_config_revision)

    def start(self) -> None:
        """Report extractor version and active config revision to CDF, then start the check-in thread."""
        self.cognite_client.post(
            f"/api/v1/projects/{self.cognite_client.config.project}/odin/extractorinfo",
            json={
                "externalId": self.connection_config.extraction_pipeline,
                "activeConfigRevision": self.current_config_revision,
                "extractor": {
                    "version": self.VERSION,
                    "externalId": self.EXTERNAL_ID,
                },
            },
            headers={"cdf-version": "alpha"},
        )
        # Daemon thread, so a pending check-in never keeps the process alive
        Thread(target=self._run_checkin, name="ExtractorCheckin", daemon=True).start()

    def stop(self) -> None:
        """Cancel the cancellation token, which also ends the check-in loop."""
        self.cancellation_token.cancel()

    def __enter__(self) -> Self:
        self.start()
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> bool:
        self.stop()
        # False when an exception is in flight, so it propagates to the caller
        return exc_val is None

    def run(self) -> None:
        """Run the extractor. Must be overridden by subclasses."""
        raise NotImplementedError()
@@ -0,0 +1,171 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ import time
5
+ from argparse import ArgumentParser, Namespace
6
+ from multiprocessing import Process, Queue
7
+ from pathlib import Path
8
+ from typing import Any, Generic, Type, TypeVar
9
+
10
+ from typing_extensions import assert_never
11
+
12
+ from cognite.extractorutils.threading import CancellationToken
13
+ from cognite.extractorutils.unstable.configuration.loaders import load_file, load_from_cdf
14
+ from cognite.extractorutils.unstable.configuration.models import ConnectionConfig
15
+
16
+ from ._messaging import RuntimeMessage
17
+ from .base import ConfigRevision, ConfigType, Extractor
18
+
19
+ ExtractorType = TypeVar("ExtractorType", bound=Extractor)
20
+
21
+
22
class Runtime(Generic[ExtractorType]):
    """
    Command line entry point that runs an extractor class in a separate process.

    The runtime parses the command line, loads the connection configuration from file, gets the application
    configuration from a local file or from CDF, and spawns the extractor. When the extractor process ends,
    the runtime starts it again if a restart message was left on the message queue, and shuts down otherwise.

    Args:
        extractor: The extractor class (not an instance) to run.
    """

    def __init__(
        self,
        extractor: Type[ExtractorType],
    ) -> None:
        self._extractor_class = extractor
        self._cancellation_token = CancellationToken()
        self._cancellation_token.cancel_on_interrupt()
        self._message_queue: Queue[RuntimeMessage] = Queue()
        self.logger = logging.getLogger(f"{self._extractor_class.EXTERNAL_ID}.runtime")
        self._setup_logging()

    def _create_argparser(self) -> ArgumentParser:
        """Build the argument parser: version flag, connection config path and optional local override."""
        argparser = ArgumentParser(
            prog=sys.argv[0],
            description=self._extractor_class.DESCRIPTION,
        )
        argparser.add_argument(
            "-v",
            "--version",
            action="version",
            version=f"{self._extractor_class.NAME} v{self._extractor_class.VERSION}",
        )
        # nargs=1 makes argparse store the value as a one-element list, hence the [0] lookups elsewhere
        argparser.add_argument(
            "-c",
            "--connection-config",
            nargs=1,
            type=Path,
            required=True,
            help="Connection parameters",
        )
        argparser.add_argument(
            "-l",
            "--local-override",
            nargs=1,
            type=Path,
            required=False,
            default=None,
            help="Include to use a local application configuration instead of fetching it from CDF",
        )

        return argparser

    def _setup_logging(self) -> None:
        """Attach a console handler with UTC timestamps to the root logger and set its level to INFO."""
        # TODO: Figure out file logging for runtime
        fmt = logging.Formatter(
            "%(asctime)s.%(msecs)03d UTC [%(levelname)-8s] %(threadName)s - %(message)s",
            "%Y-%m-%d %H:%M:%S",
        )
        # Set logging to UTC
        fmt.converter = time.gmtime

        root = logging.getLogger()
        root.setLevel(logging.INFO)

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(fmt)

        root.addHandler(console_handler)

    def _inner_run(
        self,
        message_queue: Queue,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> None:
        """
        Target of the extractor process: create the extractor, attach the message queue and run it.

        Any exception escaping the extractor is logged and answered with a restart request.
        """
        # This code is run inside the new extractor process
        extractor = self._extractor_class.init_from_runtime(
            connection_config,
            application_config,
            current_config_revision,
        )
        extractor._set_runtime_message_queue(message_queue)

        try:
            with extractor:
                extractor.run()

        except Exception:
            self.logger.exception("Extractor crashed, will attempt restart")
            message_queue.put(RuntimeMessage.RESTART)

    def _spawn_extractor(
        self,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> Process:
        """Start the extractor in a new process with a fresh message queue and return the process."""
        # A new queue per spawn, so messages from a previous run can't leak into this one
        self._message_queue = Queue()
        process = Process(
            target=self._inner_run,
            args=(self._message_queue, connection_config, application_config, current_config_revision),
        )

        process.start()
        self.logger.info(f"Started extractor as {process.pid}")
        return process

    def _get_application_config(
        self,
        args: Namespace,
        connection_config: ConnectionConfig,
    ) -> tuple[ConfigType, ConfigRevision]:
        """
        Load the application configuration and its revision.

        Returns the local override file with revision ``"local"`` if one was given on the command line,
        otherwise the configuration and revision fetched from CDF for the configured extraction pipeline.
        """
        current_config_revision: ConfigRevision
        if args.local_override:
            current_config_revision = "local"
            application_config = load_file(args.local_override[0], self._extractor_class.CONFIG_TYPE)
        else:
            client = connection_config.get_cognite_client(
                f"{self._extractor_class.EXTERNAL_ID}-{self._extractor_class.VERSION}"
            )
            application_config, current_config_revision = load_from_cdf(
                client,
                connection_config.extraction_pipeline,
                self._extractor_class.CONFIG_TYPE,
            )

        return application_config, current_config_revision

    def run(self) -> None:
        """Parse the command line and keep (re)starting the extractor until no restart is requested."""
        argparser = self._create_argparser()
        args = argparser.parse_args()

        self.logger.info(f"Started runtime as {os.getpid()}")

        connection_config = load_file(args.connection_config[0], ConnectionConfig)

        # This has to be Any. The concrete config type of the extractor is not known at type checking time
        # here, and I have not found a way to represent it in a generic way that isn't just an Any in disguise.
        application_config: Any
        while not self._cancellation_token.is_cancelled:
            # The config is reloaded on every iteration, so a restart picks up a new revision
            application_config, current_config_revision = self._get_application_config(args, connection_config)
            # Start extractor in separate process, and wait for it to end
            process = self._spawn_extractor(connection_config, application_config, current_config_revision)
            process.join()

            # Check if we are asked to restart the extractor, shut down otherwise
            if not self._message_queue.empty():
                message = self._message_queue.get_nowait()
                match message:
                    case RuntimeMessage.RESTART:
                        continue

                    case _:
                        assert_never(message)

            else:
                self.logger.info("Shutting down runtime")
                self._cancellation_token.cancel()
@@ -0,0 +1,3 @@
1
+ from ._scheduler import TaskScheduler
2
+
3
+ __all__ = ["TaskScheduler"]
@@ -0,0 +1,102 @@
1
+ from dataclasses import dataclass
2
+ from logging import getLogger
3
+ from threading import RLock, Thread
4
+ from time import time
5
+ from typing import Callable
6
+
7
+ import arrow
8
+ from humps import pascalize
9
+
10
+ from cognite.extractorutils.threading import CancellationToken
11
+ from cognite.extractorutils.unstable.configuration.models import CronConfig, IntervalConfig, ScheduleConfig
12
+ from cognite.extractorutils.unstable.scheduling._schedules import CronSchedule, IntervalSchedule, Schedule
13
+
14
+
15
@dataclass
class Job:
    """A named callable together with the schedule that decides when it runs."""

    name: str
    call: Callable[[], None]
    schedule: Schedule

    def __hash__(self) -> int:
        # Hash on the name only, which lets jobs be kept in a set
        return hash(self.name)
23
+
24
+
25
+ class TaskScheduler:
26
+ def __init__(self, cancellation_token: CancellationToken) -> None:
27
+ self._cancellation_token = cancellation_token
28
+ self._jobs: dict[str, Job] = {}
29
+ self._jobs_lock = RLock()
30
+ self._running: set[Job] = set()
31
+ self._running_lock = RLock()
32
+
33
+ self._logger = getLogger()
34
+
35
+ def schedule_task(self, name: str, schedule: ScheduleConfig, task: Callable[[], None]) -> None:
36
+ parsed_schedule: Schedule
37
+ match schedule:
38
+ case CronConfig() as cron_config:
39
+ parsed_schedule = CronSchedule(expression=cron_config.expression)
40
+
41
+ case IntervalConfig() as interval_config:
42
+ parsed_schedule = IntervalSchedule(interval=interval_config.expression.seconds)
43
+
44
+ with self._jobs_lock:
45
+ self._jobs[name] = Job(name=name, call=task, schedule=parsed_schedule)
46
+
47
+ def _get_next(self) -> list[Job]:
48
+ if not self._jobs:
49
+ return []
50
+ with self._jobs_lock:
51
+ next_runs = sorted([(j.schedule.next(), j) for j in self._jobs.values()], key=lambda tup: tup[0])
52
+ return [job for (next, job) in next_runs if next == next_runs[0][0]] if next_runs else []
53
+
54
+ def _run_job(self, job: Job) -> bool:
55
+ with self._running_lock:
56
+ if job in self._running:
57
+ self._logger.warning(f"Job {job.name} already running")
58
+ return False
59
+
60
+ def wrap() -> None:
61
+ with self._running_lock:
62
+ self._running.add(job)
63
+ try:
64
+ job.call()
65
+
66
+ self._logger.info(f"Job {job.name} done. Next run at {arrow.get(job.schedule.next()).isoformat()}")
67
+
68
+ finally:
69
+ with self._running_lock:
70
+ self._running.remove(job)
71
+
72
+ Thread(target=wrap, name=f"Run{pascalize(job.name)}").start()
73
+ return True
74
+
75
+ def trigger(self, name: str) -> bool:
76
+ return self._run_job(self._jobs[name])
77
+
78
+ def run(self) -> None:
79
+ if not self._jobs:
80
+ raise ValueError("Can't run scheduler without any scheduled tasks")
81
+
82
+ # Run all interval jobs on startup since the first next() is one interval from now
83
+ for job in [j for j in self._jobs.values() if isinstance(j.schedule, IntervalSchedule)]:
84
+ self.trigger(job.name)
85
+
86
+ while not self._cancellation_token.is_cancelled:
87
+ next_runs = self._get_next()
88
+
89
+ next_time = next_runs[0].schedule.next()
90
+ wait_time = max(next_time - time(), 0)
91
+
92
+ if wait_time:
93
+ self._logger.info(f"Waiting until {arrow.get(next_time).isoformat()}")
94
+ if self._cancellation_token.wait(wait_time):
95
+ break
96
+
97
+ for job in next_runs:
98
+ self._logger.info(f"Starting job {job.name}")
99
+ self._run_job(job)
100
+
101
+ def stop(self) -> None:
102
+ self._cancellation_token.cancel()
@@ -0,0 +1,31 @@
1
+ from abc import ABC, abstractmethod
2
+ from time import time
3
+
4
+ from croniter import croniter
5
+
6
+
7
class Schedule(ABC):
    """Interface for anything that can tell when a job should run next."""

    @abstractmethod
    def next(self) -> int:
        """Return the time of the next run as an integer timestamp."""
11
+
12
+
13
class CronSchedule(Schedule):
    """Schedule driven by a cron expression."""

    def __init__(self, expression: str) -> None:
        self._cron = croniter(expression)

    def next(self) -> int:
        """Return the next run time after the current time, truncated to an integer."""
        now = time()
        upcoming = self._cron.get_next(start_time=now)
        return int(upcoming)
19
+
20
+
21
class IntervalSchedule(Schedule):
    """
    Schedule that fires at a fixed interval, counted from the time the schedule was created.

    Args:
        interval: Number of seconds between runs. Must be positive.

    Raises:
        ValueError: If ``interval`` is zero or negative.
    """

    def __init__(self, interval: int) -> None:
        # A non-positive interval would make the catch-up loop in next() spin forever
        if interval <= 0:
            raise ValueError(f"Interval must be positive, got {interval}")

        self._interval = interval
        self._next = int(time())

    def next(self) -> int:
        """Return the next run time in seconds since the epoch, skipping any slots already in the past."""
        now = time()
        while now > self._next:
            self._next += self._interval

        return self._next
@@ -19,6 +19,7 @@ from math import ceil
19
19
  from os import PathLike
20
20
  from types import TracebackType
21
21
  from typing import Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union
22
+ from urllib.parse import ParseResult, urlparse
22
23
 
23
24
  from httpx import URL, Client, Headers, Request, StreamConsumed, SyncByteStream
24
25
  from requests.utils import super_len
@@ -408,12 +409,22 @@ class IOFileUploadQueue(AbstractUploadQueue):
408
409
  self, url_str: str, stream: BinaryIO, size: int, mime_type: Optional[str] = None
409
410
  ) -> Request:
410
411
  url = URL(url_str)
412
+ base_url = URL(self.cdf_client.config.base_url)
413
+
414
+ if url.host == base_url.host:
415
+ upload_url = url
416
+ else:
417
+ parsed_url: ParseResult = urlparse(url_str)
418
+ parsed_base_url: ParseResult = urlparse(self.cdf_client.config.base_url)
419
+ replaced_upload_url = parsed_url._replace(netloc=parsed_base_url.netloc).geturl()
420
+ upload_url = URL(replaced_upload_url)
421
+
411
422
  headers = Headers(self._httpx_client.headers)
412
423
  headers.update(
413
424
  {
414
425
  "Accept": "*/*",
415
426
  "Content-Length": str(size),
416
- "Host": url.netloc.decode("ascii"),
427
+ "Host": upload_url.netloc.decode("ascii"),
417
428
  "x-cdp-app": self.cdf_client._config.client_name,
418
429
  }
419
430
  )
@@ -423,7 +434,7 @@ class IOFileUploadQueue(AbstractUploadQueue):
423
434
 
424
435
  return Request(
425
436
  method="PUT",
426
- url=url,
437
+ url=upload_url,
427
438
  stream=IOByteStream(stream),
428
439
  headers=headers,
429
440
  )
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cognite-extractor-utils"
3
- version = "7.4.7"
3
+ version = "7.4.8"
4
4
  description = "Utilities for easier development of extractors for CDF"
5
5
  authors = ["Mathias Lohne <mathias.lohne@cognite.com>"]
6
6
  license = "Apache-2.0"
@@ -74,6 +74,7 @@ orjson = "^3.10.3"
74
74
  httpx = "^0.27.0"
75
75
  pydantic = "^2.8.2"
76
76
  pyhumps = "^3.8.0"
77
+ croniter = "^3.0.3"
77
78
 
78
79
  [tool.poetry.extras]
79
80
  experimental = ["cognite-sdk-experimental"]
@@ -93,7 +94,7 @@ parameterized = "*"
93
94
  requests = "^2.31.0"
94
95
  types-requests = "^2.31.0.20240125"
95
96
  httpx = "^0.27.0"
96
- faker = "^29.0.0"
97
+ faker = "^30.0.0"
97
98
 
98
99
  [build-system]
99
100
  requires = ["poetry-core>=1.0.0"]