cognite-extractor-utils 7.4.6__tar.gz → 7.4.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cognite-extractor-utils might be problematic. Click here for more details.

Files changed (44) hide show
  1. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/PKG-INFO +2 -1
  2. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/__init__.py +1 -1
  3. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/_util.py +4 -2
  4. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/elements.py +48 -0
  5. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/loaders.py +21 -1
  6. cognite_extractor_utils-7.4.8/cognite/extractorutils/configtools/validators.py +37 -0
  7. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/configuration/loaders.py +3 -3
  8. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/configuration/models.py +82 -2
  9. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/__init__.py +0 -0
  10. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/__main__.py +31 -0
  11. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/_messaging.py +5 -0
  12. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/base.py +116 -0
  13. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/core/runtime.py +171 -0
  14. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/scheduling/__init__.py +3 -0
  15. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/scheduling/_scheduler.py +102 -0
  16. cognite_extractor_utils-7.4.8/cognite/extractorutils/unstable/scheduling/_schedules.py +31 -0
  17. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/files.py +19 -6
  18. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/pyproject.toml +3 -2
  19. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/LICENSE +0 -0
  20. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/README.md +0 -0
  21. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/_inner_util.py +0 -0
  22. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/base.py +0 -0
  23. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/configtools/__init__.py +0 -0
  24. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/exceptions.py +0 -0
  25. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/metrics.py +0 -0
  26. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/py.typed +0 -0
  27. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/__init__.py +0 -0
  28. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/_base.py +0 -0
  29. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/hashing.py +0 -0
  30. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/statestore/watermark.py +0 -0
  31. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/threading.py +0 -0
  32. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/__init__.py +0 -0
  33. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/unstable/configuration/__init__.py +0 -0
  34. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/__init__.py +0 -0
  35. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/_base.py +0 -0
  36. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/_metrics.py +0 -0
  37. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/assets.py +0 -0
  38. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/data_modeling.py +0 -0
  39. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/events.py +0 -0
  40. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/raw.py +0 -0
  41. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader/time_series.py +0 -0
  42. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader_extractor.py +0 -0
  43. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/uploader_types.py +0 -0
  44. {cognite_extractor_utils-7.4.6 → cognite_extractor_utils-7.4.8}/cognite/extractorutils/util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cognite-extractor-utils
3
- Version: 7.4.6
3
+ Version: 7.4.8
4
4
  Summary: Utilities for easier development of extractors for CDF
5
5
  Home-page: https://github.com/cognitedata/python-extractor-utils
6
6
  License: Apache-2.0
@@ -18,6 +18,7 @@ Requires-Dist: arrow (>=1.0.0,<2.0.0)
18
18
  Requires-Dist: azure-identity (>=1.14.0,<2.0.0)
19
19
  Requires-Dist: azure-keyvault-secrets (>=4.7.0,<5.0.0)
20
20
  Requires-Dist: cognite-sdk (>=7.59.0,<8.0.0)
21
+ Requires-Dist: croniter (>=3.0.3,<4.0.0)
21
22
  Requires-Dist: dacite (>=1.6.0,<2.0.0)
22
23
  Requires-Dist: decorator (>=5.1.1,<6.0.0)
23
24
  Requires-Dist: httpx (>=0.27.0,<0.28.0)
@@ -16,5 +16,5 @@
16
16
  Cognite extractor utils is a Python package that simplifies the development of new extractors.
17
17
  """
18
18
 
19
- __version__ = "7.4.6"
19
+ __version__ = "7.4.8"
20
20
  from .base import Extractor
@@ -81,8 +81,10 @@ def _to_snake_case(dictionary: Dict[str, Any], case_style: str) -> Dict[str, Any
81
81
  raise ValueError(f"Invalid case style: {case_style}")
82
82
 
83
83
 
84
- def _load_certificate_data(cert_path: str, password: Optional[str]) -> Union[Tuple[str, str], Tuple[bytes, bytes]]:
85
- path = Path(cert_path)
84
+ def _load_certificate_data(
85
+ cert_path: str | Path, password: Optional[str]
86
+ ) -> Union[Tuple[str, str], Tuple[bytes, bytes]]:
87
+ path = Path(cert_path) if isinstance(cert_path, str) else cert_path
86
88
  cert_data = Path(path).read_bytes()
87
89
 
88
90
  if path.suffix == ".pem":
@@ -696,3 +696,51 @@ class StateStoreConfig:
696
696
  return LocalStateStore(file_path="states.json", cancellation_token=cancellation_token)
697
697
  else:
698
698
  return NoStateStore()
699
+
700
+
701
class RegExpFlag(Enum):
    """
    Regular expression flags that can be given in configuration.

    Every flag has a long and a short spelling, and both spellings map to the same ``re`` flag.
    """

    IGNORECASE = "ignore-case"
    IC = "i"
    ASCII = "ascii-only"
    A = "a"

    def get_regex_flag(self) -> int:
        """
        Translate this option into the corresponding flag from the ``re`` module.

        Returns:
            ``re.IGNORECASE`` for ``IGNORECASE``/``IC``, ``re.ASCII`` for ``ASCII``/``A``, and ``0`` otherwise.
        """
        if self in (RegExpFlag.IGNORECASE, RegExpFlag.IC):
            return re.IGNORECASE
        # Compare the member itself, not its string value: a plain string never equals an enum member, which
        # made this branch unreachable and silently dropped the ASCII flag.
        if self in (RegExpFlag.ASCII, RegExpFlag.A):
            return re.ASCII
        return 0
713
+
714
+
715
@dataclass
class IgnorePattern:
    """
    A regular expression used as an ignore pattern, together with its flags.

    The flags are given as either ``options`` or ``flags``, never both. A value given as ``flags`` is moved into
    ``options`` when the object is created.
    """

    pattern: str
    options: Optional[list[RegExpFlag]] = None
    flags: Optional[list[RegExpFlag]] = None

    def compile(self) -> re.Pattern[str]:
        """
        Build a compiled regular expression from the pattern and its options.

        Returns:
            The compiled pattern.
        """
        combined = 0
        if self.options:
            for option in self.options:
                combined |= option.get_regex_flag()
        return re.compile(self.pattern, combined)

    def __post_init__(self) -> None:
        has_options = self.options is not None
        has_flags = self.flags is not None

        if has_options and has_flags:
            raise ValueError("Only one of either 'options' or 'flags' can be specified.")
        if not (has_options or has_flags):
            raise ValueError("'options' is required.")

        if has_flags:
            # 'flags' is the older spelling; warn and carry its value over to 'options'
            _logger.warning("'options' is preferred over 'flags' as this may be removed in a future release")
            self.options, self.flags = self.flags, None
@@ -22,7 +22,7 @@ import sys
22
22
  from enum import Enum
23
23
  from hashlib import sha256
24
24
  from pathlib import Path
25
- from typing import Any, Callable, Dict, Generic, Iterable, Optional, TextIO, Type, TypeVar, Union, cast
25
+ from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, TextIO, Type, TypeVar, Union, cast
26
26
 
27
27
  import dacite
28
28
  import yaml
@@ -37,6 +37,7 @@ from cognite.extractorutils.configtools._util import _to_snake_case
37
37
  from cognite.extractorutils.configtools.elements import (
38
38
  BaseConfig,
39
39
  ConfigType,
40
+ IgnorePattern,
40
41
  TimeIntervalConfig,
41
42
  _BaseConfig,
42
43
  )
@@ -320,6 +321,25 @@ def load_yaml_dict(
320
321
  )
321
322
 
322
323
 
324
def compile_patterns(ignore_patterns: List[Union[str, IgnorePattern]]) -> list[re.Pattern[str]]:
    """
    Compile a list of ignore patterns into regular expression objects.

    Args:
        ignore_patterns: A list of strings or IgnorePattern to be compiled. Strings are compiled without flags,
            while an IgnorePattern is compiled through its own ``compile`` method.

    Returns:
        A list of compiled RegExp patterns, in the same order as the input.
    """
    # IgnorePattern.compile() already returns a compiled pattern, so it is used as is instead of being passed
    # through re.compile a second time.
    return [p.compile() if isinstance(p, IgnorePattern) else re.compile(p) for p in ignore_patterns]
341
+
342
+
323
343
  class ConfigResolver(Generic[CustomConfigClass]):
324
344
  def __init__(self, config_path: str, config_type: Type[CustomConfigClass]):
325
345
  self.config_path = config_path
@@ -0,0 +1,37 @@
1
+ import logging
2
+ import re
3
+ from typing import Union
4
+
5
+ _logger = logging.getLogger(__name__)
6
+
7
+
8
def matches_patterns(patterns: list[Union[str, re.Pattern[str]]], string: str) -> bool:
    """
    Check string against list of RegExp patterns.

    Args:
        patterns: A list of (re) patterns to match string against.
        string: String to which we match the pattern.

    Returns:
        boolean value indicating whether string matches any of the patterns. An empty list never matches.
    """
    # A generator (rather than a list) lets any() stop at the first pattern that matches
    return any(matches_pattern(pattern, string) for pattern in patterns)
20
+
21
+
22
def matches_pattern(pattern: Union[str, re.Pattern[str]], string: str) -> bool:
    """
    Search a string for a single RegExp pattern.

    Args:
        pattern: The pattern, either as a string or as a compiled (re) pattern.
        string: The string to search in.

    Returns:
        True if the pattern is found anywhere in the string. False if it is not found, or if the pattern could
        not be applied (in which case a warning is logged).
    """
    try:
        found = re.search(pattern, string)
    except re.error as e:
        _logger.warning(f"Could not apply RegExp: {pattern}\nReason: {e}")
        return False
    return found is not None
@@ -2,7 +2,7 @@ import json
2
2
  from enum import Enum
3
3
  from io import StringIO
4
4
  from pathlib import Path
5
- from typing import Dict, Optional, TextIO, Type, TypeVar, Union
5
+ from typing import Dict, Optional, TextIO, Tuple, Type, TypeVar, Union
6
6
 
7
7
  from pydantic import ValidationError
8
8
 
@@ -33,7 +33,7 @@ def load_file(path: Path, schema: Type[_T]) -> _T:
33
33
 
34
34
  def load_from_cdf(
35
35
  cognite_client: CogniteClient, external_id: str, schema: Type[_T], revision: Optional[int] = None
36
- ) -> _T:
36
+ ) -> Tuple[_T, int]:
37
37
  params: Dict[str, Union[str, int]] = {"externalId": external_id}
38
38
  if revision:
39
39
  params["revision"] = revision
@@ -44,7 +44,7 @@ def load_from_cdf(
44
44
  )
45
45
  response.raise_for_status()
46
46
  data = response.json()
47
- return load_io(StringIO(data["config"]), ConfigFormat.YAML, schema)
47
+ return load_io(StringIO(data["config"]), ConfigFormat.YAML, schema), data["revision"]
48
48
 
49
49
 
50
50
  def load_io(stream: TextIO, format: ConfigFormat, schema: Type[_T]) -> _T:
@@ -7,7 +7,16 @@ from typing import Annotated, Any, Dict, List, Literal, Optional, Union
7
7
  from humps import kebabize
8
8
  from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler
9
9
  from pydantic_core import CoreSchema, core_schema
10
-
10
+ from typing_extensions import assert_never
11
+
12
+ from cognite.client import CogniteClient
13
+ from cognite.client.config import ClientConfig
14
+ from cognite.client.credentials import (
15
+ CredentialProvider,
16
+ OAuthClientCertificate,
17
+ OAuthClientCredentials,
18
+ )
19
+ from cognite.extractorutils.configtools._util import _load_certificate_data
11
20
  from cognite.extractorutils.exceptions import InvalidConfigError
12
21
 
13
22
 
@@ -33,7 +42,9 @@ class _ClientCredentialsConfig(ConfigModel):
33
42
  class _ClientCertificateConfig(ConfigModel):
34
43
  type: Literal["client-certificate"]
35
44
  client_id: str
36
- certificate_path: Path
45
+ path: Path
46
+ password: Optional[str] = None
47
+ authority_url: str
37
48
  scopes: List[str]
38
49
 
39
50
 
@@ -121,6 +132,7 @@ class _ConnectionParameters(ConfigModel):
121
132
  max_connection_pool_size: int = 50
122
133
  ssl_verify: bool = True
123
134
  proxies: Dict[str, str] = Field(default_factory=dict)
135
+ timeout: TimeIntervalConfig = Field(default_factory=lambda: TimeIntervalConfig("30s"))
124
136
 
125
137
 
126
138
  class ConnectionConfig(ConfigModel):
@@ -133,6 +145,74 @@ class ConnectionConfig(ConfigModel):
133
145
 
134
146
  connection: _ConnectionParameters = Field(default_factory=_ConnectionParameters)
135
147
 
148
def get_cognite_client(self, client_name: str) -> CogniteClient:
    """
    Create a CogniteClient from this connection configuration.

    Besides building the client, this overwrites several settings on the SDK's ``global_config`` object with
    the values from ``self.connection``.

    Args:
        client_name: Passed on to ``ClientConfig`` as ``client_name``.

    Returns:
        A CogniteClient using the project, base URL, timeout and credentials from this configuration.
    """
    from cognite.client.config import global_config

    # These settings are set on the SDK's global config object, not on the ClientConfig created further down
    global_config.disable_pypi_version_check = True
    global_config.disable_gzip = not self.connection.gzip_compression
    global_config.status_forcelist = set(self.connection.status_forcelist)
    global_config.max_retries = self.connection.max_retries
    global_config.max_retries_connect = self.connection.max_retries_connect
    global_config.max_retry_backoff = self.connection.max_retry_backoff.seconds
    global_config.max_connection_pool_size = self.connection.max_connection_pool_size
    global_config.disable_ssl = not self.connection.ssl_verify
    global_config.proxies = self.connection.proxies

    # Build the credential provider matching the configured authentication type
    credential_provider: CredentialProvider
    match self.authentication:
        case _ClientCredentialsConfig() as client_credentials:
            kwargs = {
                "token_url": client_credentials.token_url,
                "client_id": client_credentials.client_id,
                "client_secret": client_credentials.client_secret,
                "scopes": client_credentials.scopes,
            }
            # audience and resource are only passed on when they are set
            if client_credentials.audience is not None:
                kwargs["audience"] = client_credentials.audience
            if client_credentials.resource is not None:
                kwargs["resource"] = client_credentials.resource

            credential_provider = OAuthClientCredentials(**kwargs)  # type: ignore  # I know what I'm doing

        case _ClientCertificateConfig() as client_certificate:
            thumbprint, key = _load_certificate_data(
                client_certificate.path,
                client_certificate.password,
            )
            credential_provider = OAuthClientCertificate(
                authority_url=client_certificate.authority_url,
                client_id=client_certificate.client_id,
                cert_thumbprint=str(thumbprint),
                certificate=str(key),
                scopes=client_certificate.scopes,
            )

        case _:
            # Exhaustiveness check: every authentication type is expected to be handled above
            assert_never(self.authentication)

    client_config = ClientConfig(
        project=self.project,
        base_url=self.base_url,
        client_name=client_name,
        timeout=self.connection.timeout.seconds,
        credentials=credential_provider,
    )

    return CogniteClient(client_config)
202
+
203
+
204
class CronConfig(ConfigModel):
    """
    Schedule configuration given as a cron expression.
    """

    # Fixed to "cron"; this field is the discriminator used by ScheduleConfig
    type: Literal["cron"]
    # The cron expression, kept as a plain string
    expression: str
207
+
208
+
209
class IntervalConfig(ConfigModel):
    """
    Schedule configuration given as a fixed time interval.
    """

    # Fixed to "interval"; this field is the discriminator used by ScheduleConfig
    type: Literal["interval"]
    # The interval between runs
    expression: TimeIntervalConfig
212
+
213
+
214
+ ScheduleConfig = Annotated[CronConfig | IntervalConfig, Field(discriminator="type")]
215
+
136
216
 
137
217
  class LogLevel(Enum):
138
218
  CRITICAL = "CRITICAL"
@@ -0,0 +1,31 @@
1
+ """
2
+ Example of how you would build an extractor with the new base class
3
+ """
4
+
5
+ from cognite.extractorutils.unstable.configuration.models import ExtractorConfig
6
+
7
+ from .base import Extractor
8
+ from .runtime import Runtime
9
+
10
+
11
class MyConfig(ExtractorConfig):
    """
    Application configuration for the example extractor.
    """

    parameter_one: int
    parameter_two: str
14
+
15
+
16
class MyExtractor(Extractor[MyConfig]):
    """
    Example extractor built on the new Extractor base class, configured by MyConfig.
    """

    NAME = "Test extractor"
    EXTERNAL_ID = "test-extractor"
    DESCRIPTION = "Test of the new runtime"
    VERSION = "1.0.0"
    CONFIG_TYPE = MyConfig

    def run(self) -> None:
        """
        Log a start message, wait on the cancellation token and raise if the wait returns a falsy value.

        Raises:
            ValueError: If ``cancellation_token.wait(10)`` returns a falsy value.
        """
        self.logger.info("Started!")
        # NOTE(review): presumably wait() returns False when the timeout passes without a cancellation, so this
        # raises on purpose to exercise the runtime's crash handling - confirm against CancellationToken.wait
        if not self.cancellation_token.wait(10):
            raise ValueError("Oops")
27
+
28
+
29
if __name__ == "__main__":
    # Only when executed as a script: hand the example extractor class to a Runtime and run it
    runtime = Runtime(MyExtractor)
    runtime.run()
@@ -0,0 +1,5 @@
1
+ from enum import Enum
2
+
3
+
4
class RuntimeMessage(Enum):
    """
    Messages that are sent to the runtime over its message queue.
    """

    # Ask the runtime to start the extractor again
    RESTART = 1
@@ -0,0 +1,116 @@
1
+ import logging
2
+ from multiprocessing import Queue
3
+ from threading import RLock, Thread
4
+ from types import TracebackType
5
+ from typing import Generic, Literal, Optional, Type, TypeVar, Union
6
+
7
+ from typing_extensions import Self
8
+
9
+ from cognite.extractorutils.threading import CancellationToken
10
+ from cognite.extractorutils.unstable.configuration.models import ConnectionConfig, ExtractorConfig
11
+ from cognite.extractorutils.unstable.core._messaging import RuntimeMessage
12
+
13
+ ConfigType = TypeVar("ConfigType", bound=ExtractorConfig)
14
+ ConfigRevision = Union[Literal["local"], int]
15
+
16
+
17
class Extractor(Generic[ConfigType]):
    """
    Base class for extractors.

    Subclasses provide values for the ``NAME``, ``EXTERNAL_ID``, ``DESCRIPTION``, ``VERSION`` and ``CONFIG_TYPE``
    class attributes and implement ``run``. The object can be used as a context manager, which calls ``start``
    on entry and ``stop`` on exit.
    """

    NAME: str
    EXTERNAL_ID: str
    DESCRIPTION: str
    VERSION: str

    CONFIG_TYPE: Type[ConfigType]

    def __init__(
        self,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> None:
        """
        Args:
            connection_config: Connection configuration, also used to create the Cognite client.
            application_config: The extractor specific configuration.
            current_config_revision: Revision of the application configuration, either ``"local"`` or a number.
        """
        self.cancellation_token = CancellationToken()
        self.cancellation_token.cancel_on_interrupt()

        self.connection_config = connection_config
        self.application_config = application_config
        self.current_config_revision = current_config_revision

        self.cognite_client = self.connection_config.get_cognite_client(f"{self.EXTERNAL_ID}-{self.VERSION}")

        self._checkin_lock = RLock()
        # Stays None until a queue is provided through _set_runtime_message_queue
        self._runtime_messages: Optional[Queue[RuntimeMessage]] = None

        self.logger = logging.getLogger(f"{self.EXTERNAL_ID}.main")

    def _set_runtime_message_queue(self, queue: Queue) -> None:
        """
        Set the queue that ``restart`` puts its message on.
        """
        self._runtime_messages = queue

    def _run_checkin(self) -> None:
        """
        Check in repeatedly until the cancellation token is cancelled.

        A failing check-in is logged and does not stop the loop. ``cancellation_token.wait(10)`` is called after
        every attempt.
        """

        def checkin() -> None:
            body = {"externalId": self.connection_config.extraction_pipeline}

            with self._checkin_lock:
                res = self.cognite_client.post(
                    f"/api/v1/projects/{self.cognite_client.config.project}/odin/checkin",
                    json=body,
                    headers={"cdf-version": "alpha"},
                )
                new_config_revision = res.json().get("lastConfigRevision")

                # A reported revision that differs from the one we run with triggers a restart
                if new_config_revision and new_config_revision != self.current_config_revision:
                    self.restart()

        while not self.cancellation_token.is_cancelled:
            try:
                checkin()
            except Exception:
                self.logger.exception("Error during checkin")
            self.cancellation_token.wait(10)

    def restart(self) -> None:
        """
        Put a RESTART message on the runtime message queue, if one is set, and cancel the cancellation token.
        """
        if self._runtime_messages:
            self._runtime_messages.put(RuntimeMessage.RESTART)
        self.cancellation_token.cancel()

    @classmethod
    def init_from_runtime(
        cls,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> Self:
        """
        Alternative constructor, taking the same arguments as ``__init__`` and passing them on unchanged.

        Returns:
            A new instance of the class it is called on.
        """
        return cls(connection_config, application_config, current_config_revision)

    def start(self) -> None:
        """
        Post information about this extractor and its active config revision, then start the check-in thread.
        """
        self.cognite_client.post(
            f"/api/v1/projects/{self.cognite_client.config.project}/odin/extractorinfo",
            json={
                "externalId": self.connection_config.extraction_pipeline,
                "activeConfigRevision": self.current_config_revision,
                "extractor": {
                    "version": self.VERSION,
                    "externalId": self.EXTERNAL_ID,
                },
            },
            headers={"cdf-version": "alpha"},
        )
        # Daemon thread, so a running check-in loop does not keep the process alive
        Thread(target=self._run_checkin, name="ExtractorCheckin", daemon=True).start()

    def stop(self) -> None:
        """
        Cancel the cancellation token.
        """
        self.cancellation_token.cancel()

    def __enter__(self) -> Self:
        self.start()
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> bool:
        """
        Stop the extractor when leaving the ``with`` block.

        Returns:
            False when an exception was raised inside the block, so that the exception is not suppressed.
        """
        self.stop()
        return exc_val is None

    def run(self) -> None:
        """
        The work of the extractor. Must be implemented by subclasses.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
@@ -0,0 +1,171 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ import time
5
+ from argparse import ArgumentParser, Namespace
6
+ from multiprocessing import Process, Queue
7
+ from pathlib import Path
8
+ from typing import Any, Generic, Type, TypeVar
9
+
10
+ from typing_extensions import assert_never
11
+
12
+ from cognite.extractorutils.threading import CancellationToken
13
+ from cognite.extractorutils.unstable.configuration.loaders import load_file, load_from_cdf
14
+ from cognite.extractorutils.unstable.configuration.models import ConnectionConfig
15
+
16
+ from ._messaging import RuntimeMessage
17
+ from .base import ConfigRevision, ConfigType, Extractor
18
+
19
+ ExtractorType = TypeVar("ExtractorType", bound=Extractor)
20
+
21
+
22
class Runtime(Generic[ExtractorType]):
    """
    Command line runner that starts an extractor class in a separate process and restarts it when asked to.
    """

    def __init__(
        self,
        extractor: Type[ExtractorType],
    ) -> None:
        """
        Args:
            extractor: The extractor class (not an instance) to run.
        """
        self._extractor_class = extractor
        self._cancellation_token = CancellationToken()
        self._cancellation_token.cancel_on_interrupt()
        self._message_queue: Queue[RuntimeMessage] = Queue()
        self.logger = logging.getLogger(f"{self._extractor_class.EXTERNAL_ID}.runtime")
        self._setup_logging()

    def _create_argparser(self) -> ArgumentParser:
        """
        Build the command line parser.

        Returns:
            A parser with a version flag, a required connection config path and an optional path to a local
            application config.
        """
        argparser = ArgumentParser(
            prog=sys.argv[0],
            description=self._extractor_class.DESCRIPTION,
        )
        argparser.add_argument(
            "-v",
            "--version",
            action="version",
            version=f"{self._extractor_class.NAME} v{self._extractor_class.VERSION}",
        )
        # nargs=1 makes the parsed value a one-element list, hence the [0] where these arguments are read
        argparser.add_argument(
            "-c",
            "--connection-config",
            nargs=1,
            type=Path,
            required=True,
            help="Connection parameters",
        )
        argparser.add_argument(
            "-l",
            "--local-override",
            nargs=1,
            type=Path,
            required=False,
            default=None,
            help="Include to use a local application configuration instead of fetching it from CDF",
        )

        return argparser

    def _setup_logging(self) -> None:
        """
        Add a console handler with UTC timestamps to the root logger and set the root level to INFO.
        """
        # TODO: Figure out file logging for runtime
        fmt = logging.Formatter(
            "%(asctime)s.%(msecs)03d UTC [%(levelname)-8s] %(threadName)s - %(message)s",
            "%Y-%m-%d %H:%M:%S",
        )
        # Set logging to UTC
        fmt.converter = time.gmtime

        root = logging.getLogger()
        root.setLevel(logging.INFO)

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(fmt)

        root.addHandler(console_handler)

    def _inner_run(
        self,
        message_queue: Queue,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> None:
        """
        Create the extractor and run it. This is the target of the process started by ``_spawn_extractor``.

        If the extractor raises, the error is logged and a RESTART message is put on the queue.

        Args:
            message_queue: Queue used to send messages back to the runtime.
            connection_config: Connection configuration for the extractor.
            application_config: Application configuration for the extractor.
            current_config_revision: Revision of the application configuration.
        """
        # This code is run inside the new extractor process
        extractor = self._extractor_class.init_from_runtime(
            connection_config,
            application_config,
            current_config_revision,
        )
        extractor._set_runtime_message_queue(message_queue)

        try:
            with extractor:
                extractor.run()

        except Exception:
            self.logger.exception("Extractor crashed, will attempt restart")
            message_queue.put(RuntimeMessage.RESTART)

    def _spawn_extractor(
        self,
        connection_config: ConnectionConfig,
        application_config: ConfigType,
        current_config_revision: ConfigRevision,
    ) -> Process:
        """
        Start the extractor in a new process.

        Returns:
            The started process.
        """
        # A new queue is created for every spawned process
        self._message_queue = Queue()
        process = Process(
            target=self._inner_run,
            args=(self._message_queue, connection_config, application_config, current_config_revision),
        )

        process.start()
        self.logger.info(f"Started extractor as {process.pid}")
        return process

    def _get_application_config(
        self,
        args: Namespace,
        connection_config: ConnectionConfig,
    ) -> tuple[ConfigType, ConfigRevision]:
        """
        Load the application config, from the local override file if one was given and from CDF otherwise.

        Args:
            args: Parsed command line arguments.
            connection_config: Connection configuration, used to create a client when loading from CDF.

        Returns:
            The application config and its revision. The revision is ``"local"`` for a local override.
        """
        current_config_revision: ConfigRevision
        if args.local_override:
            current_config_revision = "local"
            application_config = load_file(args.local_override[0], self._extractor_class.CONFIG_TYPE)
        else:
            client = connection_config.get_cognite_client(
                f"{self._extractor_class.EXTERNAL_ID}-{self._extractor_class.VERSION}"
            )
            application_config, current_config_revision = load_from_cdf(
                client,
                connection_config.extraction_pipeline,
                self._extractor_class.CONFIG_TYPE,
            )

        return application_config, current_config_revision

    def run(self) -> None:
        """
        Parse the command line, then run the extractor in a child process until it exits without asking for a
        restart.

        The application config is loaded again before every (re)start.
        """
        argparser = self._create_argparser()
        args = argparser.parse_args()

        self.logger.info(f"Started runtime as {os.getpid()}")

        connection_config = load_file(args.connection_config[0], ConnectionConfig)

        # This has to be Any. The concrete type of the extractor's config is not known at type checking time,
        # and I have not found a way to represent it in a generic way that isn't just an Any in disguise.
        application_config: Any
        while not self._cancellation_token.is_cancelled:
            application_config, current_config_revision = self._get_application_config(args, connection_config)
            # Start extractor in separate process, and wait for it to end
            process = self._spawn_extractor(connection_config, application_config, current_config_revision)
            process.join()

            # Check if we are asked to restart the extractor, shut down otherwise
            if not self._message_queue.empty():
                message = self._message_queue.get_nowait()
                match message:
                    case RuntimeMessage.RESTART:
                        continue

                    case _:
                        assert_never(message)

            else:
                self.logger.info("Shutting down runtime")
                self._cancellation_token.cancel()
@@ -0,0 +1,3 @@
1
+ from ._scheduler import TaskScheduler
2
+
3
+ __all__ = ["TaskScheduler"]
@@ -0,0 +1,102 @@
1
+ from dataclasses import dataclass
2
+ from logging import getLogger
3
+ from threading import RLock, Thread
4
+ from time import time
5
+ from typing import Callable
6
+
7
+ import arrow
8
+ from humps import pascalize
9
+
10
+ from cognite.extractorutils.threading import CancellationToken
11
+ from cognite.extractorutils.unstable.configuration.models import CronConfig, IntervalConfig, ScheduleConfig
12
+ from cognite.extractorutils.unstable.scheduling._schedules import CronSchedule, IntervalSchedule, Schedule
13
+
14
+
15
@dataclass
class Job:
    """
    A named task together with the schedule it runs on.
    """

    name: str
    # The function to call when the job runs
    call: Callable[[], None]
    schedule: Schedule

    def __hash__(self) -> int:
        # Hash on the name only, which makes jobs usable as set members
        return hash(self.name)
23
+
24
+
25
+ class TaskScheduler:
26
+ def __init__(self, cancellation_token: CancellationToken) -> None:
27
+ self._cancellation_token = cancellation_token
28
+ self._jobs: dict[str, Job] = {}
29
+ self._jobs_lock = RLock()
30
+ self._running: set[Job] = set()
31
+ self._running_lock = RLock()
32
+
33
+ self._logger = getLogger()
34
+
35
+ def schedule_task(self, name: str, schedule: ScheduleConfig, task: Callable[[], None]) -> None:
36
+ parsed_schedule: Schedule
37
+ match schedule:
38
+ case CronConfig() as cron_config:
39
+ parsed_schedule = CronSchedule(expression=cron_config.expression)
40
+
41
+ case IntervalConfig() as interval_config:
42
+ parsed_schedule = IntervalSchedule(interval=interval_config.expression.seconds)
43
+
44
+ with self._jobs_lock:
45
+ self._jobs[name] = Job(name=name, call=task, schedule=parsed_schedule)
46
+
47
+ def _get_next(self) -> list[Job]:
48
+ if not self._jobs:
49
+ return []
50
+ with self._jobs_lock:
51
+ next_runs = sorted([(j.schedule.next(), j) for j in self._jobs.values()], key=lambda tup: tup[0])
52
+ return [job for (next, job) in next_runs if next == next_runs[0][0]] if next_runs else []
53
+
54
+ def _run_job(self, job: Job) -> bool:
55
+ with self._running_lock:
56
+ if job in self._running:
57
+ self._logger.warning(f"Job {job.name} already running")
58
+ return False
59
+
60
+ def wrap() -> None:
61
+ with self._running_lock:
62
+ self._running.add(job)
63
+ try:
64
+ job.call()
65
+
66
+ self._logger.info(f"Job {job.name} done. Next run at {arrow.get(job.schedule.next()).isoformat()}")
67
+
68
+ finally:
69
+ with self._running_lock:
70
+ self._running.remove(job)
71
+
72
+ Thread(target=wrap, name=f"Run{pascalize(job.name)}").start()
73
+ return True
74
+
75
+ def trigger(self, name: str) -> bool:
76
+ return self._run_job(self._jobs[name])
77
+
78
+ def run(self) -> None:
79
+ if not self._jobs:
80
+ raise ValueError("Can't run scheduler without any scheduled tasks")
81
+
82
+ # Run all interval jobs on startup since the first next() is one interval from now
83
+ for job in [j for j in self._jobs.values() if isinstance(j.schedule, IntervalSchedule)]:
84
+ self.trigger(job.name)
85
+
86
+ while not self._cancellation_token.is_cancelled:
87
+ next_runs = self._get_next()
88
+
89
+ next_time = next_runs[0].schedule.next()
90
+ wait_time = max(next_time - time(), 0)
91
+
92
+ if wait_time:
93
+ self._logger.info(f"Waiting until {arrow.get(next_time).isoformat()}")
94
+ if self._cancellation_token.wait(wait_time):
95
+ break
96
+
97
+ for job in next_runs:
98
+ self._logger.info(f"Starting job {job.name}")
99
+ self._run_job(job)
100
+
101
+ def stop(self) -> None:
102
+ self._cancellation_token.cancel()
@@ -0,0 +1,31 @@
1
+ from abc import ABC, abstractmethod
2
+ from time import time
3
+
4
+ from croniter import croniter
5
+
6
+
7
class Schedule(ABC):
    """
    Abstract base class for schedules.
    """

    @abstractmethod
    def next(self) -> int:
        """
        Returns:
            The time of the next run, as an integer timestamp.
        """
        pass
11
+
12
+
13
class CronSchedule(Schedule):
    """
    Schedule defined by a cron expression, evaluated with croniter.
    """

    def __init__(self, expression: str) -> None:
        """
        Args:
            expression: The cron expression, passed on to croniter.
        """
        self._cron = croniter(expression)

    def next(self) -> int:
        """
        Returns:
            The next run time after the current time, as given by croniter and converted to an int.
        """
        return int(self._cron.get_next(start_time=time()))
19
+
20
+
21
class IntervalSchedule(Schedule):
    """
    Schedule that fires at a fixed interval, counted from the time the schedule was created.
    """

    def __init__(self, interval: int) -> None:
        """
        Args:
            interval: Number of seconds between runs. Must be greater than zero.

        Raises:
            ValueError: If interval is zero or negative.
        """
        # A non-positive interval would make the catch-up loop in next() spin forever
        if interval <= 0:
            raise ValueError(f"Interval must be greater than zero, got {interval}")
        self._interval = interval
        self._next = int(time())

    def next(self) -> int:
        """
        Returns:
            Timestamp (in seconds) of the next run. The stored time is first moved forward, one interval at a
            time, until it is no longer in the past.
        """
        t = time()
        while t > self._next:
            self._next += self._interval

        return self._next
@@ -19,6 +19,7 @@ from math import ceil
19
19
  from os import PathLike
20
20
  from types import TracebackType
21
21
  from typing import Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union
22
+ from urllib.parse import ParseResult, urlparse
22
23
 
23
24
  from httpx import URL, Client, Headers, Request, StreamConsumed, SyncByteStream
24
25
  from requests.utils import super_len
@@ -255,18 +256,20 @@ class IOFileUploadQueue(AbstractUploadQueue):
255
256
  ) -> tuple[FileMetadataOrCogniteExtractorFile, str]:
256
257
  if isinstance(file_meta, CogniteExtractorFileApply):
257
258
  node_id = self._apply_cognite_file(file_meta)
258
- file_meta, url = self._create_cdm(instance_id=node_id)
259
+ file_meta_response, url = self._create_cdm(instance_id=node_id)
259
260
  else:
260
- file_meta, url = self.cdf_client.files.create(file_metadata=file_meta, overwrite=self.overwrite_existing)
261
+ file_meta_response, url = self.cdf_client.files.create(
262
+ file_metadata=file_meta, overwrite=self.overwrite_existing
263
+ )
261
264
  # trigger update after creation (upsert =P)
262
265
  basic_attributes = set(["externalId", "name"])
263
266
  attr = set(file_meta.dump().keys())
264
267
  diff = attr - basic_attributes
265
268
 
266
269
  if len(diff) >= 1 and "externalId" in attr:
267
- file_meta = self.cdf_client.files.update(file_meta)
270
+ file_meta_response = self.cdf_client.files.update(file_meta)
268
271
 
269
- return file_meta, url
272
+ return file_meta_response, url
270
273
 
271
274
  def _upload_bytes(self, size: int, file: BinaryIO, file_meta: FileMetadataOrCogniteExtractorFile) -> None:
272
275
  file_meta, url = self._upload_empty(file_meta)
@@ -406,12 +409,22 @@ class IOFileUploadQueue(AbstractUploadQueue):
406
409
  self, url_str: str, stream: BinaryIO, size: int, mime_type: Optional[str] = None
407
410
  ) -> Request:
408
411
  url = URL(url_str)
412
+ base_url = URL(self.cdf_client.config.base_url)
413
+
414
+ if url.host == base_url.host:
415
+ upload_url = url
416
+ else:
417
+ parsed_url: ParseResult = urlparse(url_str)
418
+ parsed_base_url: ParseResult = urlparse(self.cdf_client.config.base_url)
419
+ replaced_upload_url = parsed_url._replace(netloc=parsed_base_url.netloc).geturl()
420
+ upload_url = URL(replaced_upload_url)
421
+
409
422
  headers = Headers(self._httpx_client.headers)
410
423
  headers.update(
411
424
  {
412
425
  "Accept": "*/*",
413
426
  "Content-Length": str(size),
414
- "Host": url.netloc.decode("ascii"),
427
+ "Host": upload_url.netloc.decode("ascii"),
415
428
  "x-cdp-app": self.cdf_client._config.client_name,
416
429
  }
417
430
  )
@@ -421,7 +434,7 @@ class IOFileUploadQueue(AbstractUploadQueue):
421
434
 
422
435
  return Request(
423
436
  method="PUT",
424
- url=url,
437
+ url=upload_url,
425
438
  stream=IOByteStream(stream),
426
439
  headers=headers,
427
440
  )
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cognite-extractor-utils"
3
- version = "7.4.6"
3
+ version = "7.4.8"
4
4
  description = "Utilities for easier development of extractors for CDF"
5
5
  authors = ["Mathias Lohne <mathias.lohne@cognite.com>"]
6
6
  license = "Apache-2.0"
@@ -74,6 +74,7 @@ orjson = "^3.10.3"
74
74
  httpx = "^0.27.0"
75
75
  pydantic = "^2.8.2"
76
76
  pyhumps = "^3.8.0"
77
+ croniter = "^3.0.3"
77
78
 
78
79
  [tool.poetry.extras]
79
80
  experimental = ["cognite-sdk-experimental"]
@@ -93,7 +94,7 @@ parameterized = "*"
93
94
  requests = "^2.31.0"
94
95
  types-requests = "^2.31.0.20240125"
95
96
  httpx = "^0.27.0"
96
- faker = "^28.0.0"
97
+ faker = "^30.0.0"
97
98
 
98
99
  [build-system]
99
100
  requires = ["poetry-core>=1.0.0"]