cognite-extractor-utils 7.5.13__py3-none-any.whl → 7.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cognite-extractor-utils might be problematic.

Files changed (47)
  1. cognite/extractorutils/__init__.py +1 -1
  2. cognite/extractorutils/_inner_util.py +1 -1
  3. cognite/extractorutils/base.py +120 -40
  4. cognite/extractorutils/configtools/__init__.py +4 -5
  5. cognite/extractorutils/configtools/_util.py +3 -2
  6. cognite/extractorutils/configtools/elements.py +213 -35
  7. cognite/extractorutils/configtools/loaders.py +68 -16
  8. cognite/extractorutils/configtools/validators.py +5 -1
  9. cognite/extractorutils/exceptions.py +11 -2
  10. cognite/extractorutils/metrics.py +17 -12
  11. cognite/extractorutils/statestore/__init__.py +77 -3
  12. cognite/extractorutils/statestore/_base.py +7 -3
  13. cognite/extractorutils/statestore/hashing.py +129 -15
  14. cognite/extractorutils/statestore/watermark.py +77 -87
  15. cognite/extractorutils/threading.py +30 -4
  16. cognite/extractorutils/unstable/__init__.py +5 -5
  17. cognite/extractorutils/unstable/configuration/__init__.py +3 -0
  18. cognite/extractorutils/unstable/configuration/exceptions.py +13 -2
  19. cognite/extractorutils/unstable/configuration/loaders.py +90 -19
  20. cognite/extractorutils/unstable/configuration/models.py +121 -7
  21. cognite/extractorutils/unstable/core/__init__.py +5 -0
  22. cognite/extractorutils/unstable/core/_dto.py +5 -3
  23. cognite/extractorutils/unstable/core/base.py +113 -4
  24. cognite/extractorutils/unstable/core/errors.py +41 -0
  25. cognite/extractorutils/unstable/core/logger.py +149 -0
  26. cognite/extractorutils/unstable/core/restart_policy.py +16 -2
  27. cognite/extractorutils/unstable/core/runtime.py +119 -36
  28. cognite/extractorutils/unstable/core/tasks.py +53 -1
  29. cognite/extractorutils/unstable/scheduling/__init__.py +13 -0
  30. cognite/extractorutils/unstable/scheduling/_scheduler.py +1 -1
  31. cognite/extractorutils/uploader/__init__.py +7 -5
  32. cognite/extractorutils/uploader/_base.py +4 -5
  33. cognite/extractorutils/uploader/assets.py +13 -8
  34. cognite/extractorutils/uploader/data_modeling.py +37 -2
  35. cognite/extractorutils/uploader/events.py +14 -9
  36. cognite/extractorutils/uploader/files.py +80 -21
  37. cognite/extractorutils/uploader/raw.py +12 -7
  38. cognite/extractorutils/uploader/time_series.py +58 -49
  39. cognite/extractorutils/uploader/upload_failure_handler.py +35 -2
  40. cognite/extractorutils/uploader_extractor.py +29 -6
  41. cognite/extractorutils/uploader_types.py +15 -1
  42. cognite/extractorutils/util.py +76 -23
  43. {cognite_extractor_utils-7.5.13.dist-info → cognite_extractor_utils-7.6.0.dist-info}/METADATA +1 -1
  44. cognite_extractor_utils-7.6.0.dist-info/RECORD +50 -0
  45. cognite_extractor_utils-7.5.13.dist-info/RECORD +0 -50
  46. {cognite_extractor_utils-7.5.13.dist-info → cognite_extractor_utils-7.6.0.dist-info}/WHEEL +0 -0
  47. {cognite_extractor_utils-7.5.13.dist-info → cognite_extractor_utils-7.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,3 @@
+ """
+ New version of ``configtools`` based on pydantic instead of dataclasses.
+ """
@@ -1,10 +1,15 @@
+ """
+ Exceptions representing invalid configurations.
+ """
+
+
  class InvalidConfigError(Exception):
  """
- Exception thrown from ``load_yaml`` and ``load_yaml_dict`` if config file is invalid. This can be due to
+ Exception thrown from ``load_yaml`` and ``load_yaml_dict`` if config file is invalid. This can be due to.

  * Missing fields
  * Incompatible types
- * Unkown fields
+ * Unknown fields
  """

  def __init__(self, message: str, details: list[str] | None = None):
@@ -15,7 +20,13 @@ class InvalidConfigError(Exception):
  self.attempted_revision: int | None = None

  def __str__(self) -> str:
+ """
+ Underlying message prefixed with 'Invalid config:'.
+ """
  return f"Invalid config: {self.message}"

  def __repr__(self) -> str:
+ """
+ Underlying message prefixed with 'Invalid config:'.
+ """
  return self.__str__()
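
As a hedged illustration of how this exception surfaces to callers (assuming, from the imports in the loader hunks below, that this is the cognite.extractorutils.unstable.configuration.exceptions variant; the message and details values are made up):

from cognite.extractorutils.unstable.configuration.exceptions import InvalidConfigError

try:
    # Hypothetical failure; both the message and the details list are illustrative.
    raise InvalidConfigError("missing field 'project'", details=["project: field required"])
except InvalidConfigError as err:
    print(err)        # Invalid config: missing field 'project'
    print(repr(err))  # __repr__ delegates to __str__, so the same string
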
@@ -1,3 +1,7 @@
+ """
+ Module containing functions and classes for loading configuration files.
+ """
+
  import json
  from enum import Enum
  from io import StringIO
@@ -7,46 +11,89 @@ from typing import TextIO, TypeVar
  from pydantic import ValidationError

  from cognite.client import CogniteClient
+ from cognite.client.exceptions import CogniteAPIError
  from cognite.extractorutils.configtools.loaders import _load_yaml_dict_raw
  from cognite.extractorutils.exceptions import InvalidConfigError as OldInvalidConfigError
  from cognite.extractorutils.unstable.configuration.exceptions import InvalidConfigError
  from cognite.extractorutils.unstable.configuration.models import ConfigModel

- __all__ = ["ConfigFormat", "load_file", "load_from_cdf", "load_io", "load_dict"]
+ __all__ = ["ConfigFormat", "load_dict", "load_file", "load_from_cdf", "load_io"]


  _T = TypeVar("_T", bound=ConfigModel)


  class ConfigFormat(Enum):
+ """
+ Enumeration of supported configuration file formats.
+
+ Attributes:
+ JSON: Represents the JSON configuration file format.
+ YAML: Represents the YAML configuration file format.
+ """
+
  JSON = "json"
  YAML = "yaml"


  def load_file(path: Path, schema: type[_T]) -> _T:
+ """
+ Load a configuration file from the given path and parse it into the specified schema.
+
+ Args:
+ path: Path to the configuration file.
+ schema: The schema class to parse the configuration into.
+
+ Returns:
+ An instance of the schema populated with the configuration data.
+
+ Raises:
+ InvalidConfigError: If the file type is unknown or the configuration is invalid.
+ """
  if path.suffix in [".yaml", ".yml"]:
- format = ConfigFormat.YAML
+ file_format = ConfigFormat.YAML
  elif path.suffix == ".json":
- format = ConfigFormat.JSON
+ file_format = ConfigFormat.JSON
  else:
  raise InvalidConfigError(f"Unknown file type {path.suffix}")

  with open(path) as stream:
- return load_io(stream, format, schema)
+ return load_io(stream, file_format, schema)


  def load_from_cdf(
  cognite_client: CogniteClient, external_id: str, schema: type[_T], revision: int | None = None
  ) -> tuple[_T, int]:
+ """
+ Load a configuration from a CDF integration using the provided external ID and schema.
+
+ Args:
+ cognite_client: An instance of CogniteClient to interact with CDF.
+ external_id: The external ID of the integration to load configuration from.
+ schema: The schema class to parse the configuration into.
+ revision: the specific revision of the configuration to load, otherwise get the latest.
+
+ Returns:
+ A tuple containing the parsed configuration instance and the revision number.
+
+ Raises:
+ InvalidConfigError: If the configuration is invalid or not found.
+ CogniteAPIError: If there is an unexpected error communicating with CDF.
+ """
  params: dict[str, str | int] = {"integration": external_id}
  if revision:
  params["revision"] = revision
- response = cognite_client.get(
- f"/api/v1/projects/{cognite_client.config.project}/odin/config",
- params=params,
- headers={"cdf-version": "alpha"},
- )
- response.raise_for_status()
+ try:
+ response = cognite_client.get(
+ f"/api/v1/projects/{cognite_client.config.project}/odin/config",
+ params=params,
+ headers={"cdf-version": "alpha"},
+ )
+ except CogniteAPIError as e:
+ if e.code == 404:
+ raise InvalidConfigError("No configuration found for the given integration") from e
+ raise e
+
  data = response.json()

  try:
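
Taken together, load_file and load_from_cdf can be used roughly as sketched below (assuming these hunks belong to cognite/extractorutils/unstable/configuration/loaders.py; MyConfig, its fields, the file name and the integration ID are all illustrative):

from pathlib import Path

from cognite.extractorutils.unstable.configuration.loaders import load_file, load_from_cdf
from cognite.extractorutils.unstable.configuration.models import ConfigModel, ConnectionConfig


class MyConfig(ConfigModel):
    # Hypothetical extractor-specific settings; ConfigModel's kebab-case alias
    # generator means these appear as "source-url" and "batch-size" in YAML.
    source_url: str
    batch_size: int = 100


# File-based loading: the format is picked from the suffix (.yaml/.yml or .json).
local_config = load_file(Path("config.yaml"), schema=MyConfig)

# CDF-based loading: returns the parsed config plus the revision that was fetched.
client = ConnectionConfig.from_environment().get_cognite_client("my-extractor")
remote_config, revision = load_from_cdf(client, external_id="my-integration", schema=MyConfig)
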
@@ -61,11 +108,25 @@ def load_from_cdf(
  raise new_e from e


- def load_io(stream: TextIO, format: ConfigFormat, schema: type[_T]) -> _T:
- if format == ConfigFormat.JSON:
+ def load_io(stream: TextIO, file_format: ConfigFormat, schema: type[_T]) -> _T:
+ """
+ Load a configuration from a stream (e.g., file or string) and parse it into the specified schema.
+
+ Args:
+ stream: A text stream containing the configuration data.
+ file_format: The format of the configuration data.
+ schema: The schema class to parse the configuration into.
+
+ Returns:
+ An instance of the schema populated with the configuration data.
+
+ Raises:
+ InvalidConfigError: If the file format is unknown or the configuration is invalid.
+ """
+ if file_format == ConfigFormat.JSON:
  data = json.load(stream)

- elif format == ConfigFormat.YAML:
+ elif file_format == ConfigFormat.YAML:
  data = _load_yaml_dict_raw(stream)

  if "azure-keyvault" in data:
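
load_io can also be driven from an in-memory stream, for example in tests; a minimal sketch (the schema and YAML content are illustrative):

from io import StringIO

from cognite.extractorutils.unstable.configuration.loaders import ConfigFormat, load_io
from cognite.extractorutils.unstable.configuration.models import ConfigModel


class SourceConfig(ConfigModel):
    # Hypothetical field; "source-url" in the YAML maps to source_url via the kebab-case alias.
    source_url: str


config = load_io(StringIO("source-url: https://example.com/data\n"), ConfigFormat.YAML, SourceConfig)
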
@@ -89,15 +150,25 @@ def _make_loc_str(loc: tuple) -> str:
  loc_str = f"{loc_str}{lo}"
  needs_sep = True
  else:
- if isinstance(lo, int):
- loc_str = f"{loc_str}[{lo}]"
- else:
- loc_str = f"{loc_str}.{lo}"
+ loc_str = f"{loc_str}[{lo}]" if isinstance(lo, int) else f"{loc_str}.{lo}"

  return loc_str


  def load_dict(data: dict, schema: type[_T]) -> _T:
+ """
+ Load a configuration from a dictionary and parse it into the specified schema.
+
+ Args:
+ data: A dictionary containing the configuration data.
+ schema: The schema class to parse the configuration into.
+
+ Returns:
+ An instance of the schema populated with the configuration data.
+
+ Raises:
+ InvalidConfigError: If the configuration is invalid.
+ """
  try:
  return schema.model_validate(data)

@@ -113,8 +184,8 @@ def load_dict(data: dict, schema: type[_T]) -> _T:

  if "ctx" in err and "error" in err["ctx"]:
  exc = err["ctx"]["error"]
- if isinstance(exc, ValueError) or isinstance(exc, AssertionError):
- messages.append(f"{str(exc)}: {loc_str}")
+ if isinstance(exc, ValueError | AssertionError):
+ messages.append(f"{exc!s}: {loc_str}")
  continue

  messages.append(f"{err.get('msg')}: {loc_str}")
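
The error handling above means a failed load_dict call surfaces one message per offending field rather than a raw pydantic traceback. A sketch, with a hypothetical schema and the assumption that the collected messages end up on the raised InvalidConfigError:

from cognite.extractorutils.unstable.configuration.exceptions import InvalidConfigError
from cognite.extractorutils.unstable.configuration.loaders import load_dict
from cognite.extractorutils.unstable.configuration.models import ConfigModel


class SinkConfig(ConfigModel):
    # Hypothetical schema used only for this illustration.
    destination: str
    batch_size: int


try:
    load_dict({"destination": "raw", "batch-size": "not-a-number"}, SinkConfig)
except InvalidConfigError as err:
    # Assumed: the per-field messages built above are attached to the exception.
    print(err)
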
@@ -1,3 +1,7 @@
+ """
+ Module containing pre-built models for common extractor configuration.
+ """
+
  import os
  import re
  from datetime import timedelta
@@ -21,22 +25,26 @@ from cognite.extractorutils.configtools._util import _load_certificate_data
  from cognite.extractorutils.exceptions import InvalidConfigError

  __all__ = [
- "ConfigModel",
  "AuthenticationConfig",
- "TimeIntervalConfig",
+ "ConfigModel",
  "ConnectionConfig",
  "CronConfig",
+ "ExtractorConfig",
  "IntervalConfig",
- "ScheduleConfig",
- "LogLevel",
- "LogFileHandlerConfig",
  "LogConsoleHandlerConfig",
+ "LogFileHandlerConfig",
  "LogHandlerConfig",
- "ExtractorConfig",
+ "LogLevel",
+ "ScheduleConfig",
+ "TimeIntervalConfig",
  ]


  class ConfigModel(BaseModel):
+ """
+ Base model for configuration objects, setting the correct pydantic options for extractor config.
+ """
+
  model_config = ConfigDict(
  alias_generator=kebabize,
  populate_by_name=True,
@@ -69,7 +77,7 @@ AuthenticationConfig = Annotated[_ClientCredentialsConfig | _ClientCertificateCo

  class TimeIntervalConfig:
  """
- Configuration parameter for setting a time interval
+ Configuration parameter for setting a time interval.
  """

  def __init__(self, expression: str) -> None:
@@ -77,14 +85,25 @@ class TimeIntervalConfig:

  @classmethod
  def __get_pydantic_core_schema__(cls, source_type: Any, handler: GetCoreSchemaHandler) -> CoreSchema:
+ """
+ Pydantic hook to define how this class should be serialized/deserialized.
+
+ This allows the class to be used as a field in Pydantic models.
+ """
  return core_schema.no_info_after_validator_function(cls, handler(str | int))

  def __eq__(self, other: object) -> bool:
+ """
+ Two TimeIntervalConfig objects are equal if they have the same number of seconds in their interval.
+ """
  if not isinstance(other, TimeIntervalConfig):
  return NotImplemented
  return self._interval == other._interval

  def __hash__(self) -> int:
+ """
+ Hash function for TimeIntervalConfig based on the number of seconds in the interval.
+ """
  return hash(self._interval)

  @classmethod
@@ -106,36 +125,69 @@ class TimeIntervalConfig:

  @property
  def seconds(self) -> int:
+ """
+ Time interval as number of seconds.
+ """
  return self._interval

  @property
  def minutes(self) -> float:
+ """
+ Time interval as number of minutes.
+
+ This is a float since the underlying interval is in seconds.
+ """
  return self._interval / 60

  @property
  def hours(self) -> float:
+ """
+ Time interval as number of hours.
+
+ This is a float since the underlying interval is in seconds.
+ """
  return self._interval / (60 * 60)

  @property
  def days(self) -> float:
+ """
+ Time interval as number of days.
+
+ This is a float since the underlying interval is in seconds.
+ """
  return self._interval / (60 * 60 * 24)

  @property
  def timedelta(self) -> timedelta:
+ """
+ Time interval as a timedelta object.
+ """
  days = self._interval // (60 * 60 * 24)
  seconds = self._interval % (60 * 60 * 24)
  return timedelta(days=days, seconds=seconds)

  def __int__(self) -> int:
+ """
+ Returns the time interval as a number of seconds.
+ """
  return int(self._interval)

  def __float__(self) -> float:
+ """
+ Returns the time interval as a number of seconds.
+ """
  return float(self._interval)

  def __str__(self) -> str:
+ """
+ Returns the time interval as a human readable string.
+ """
  return self._expression

  def __repr__(self) -> str:
+ """
+ Returns the time interval as a human readable string.
+ """
  return self._expression
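
For orientation, a sketch of TimeIntervalConfig in use, both standalone and as a pydantic field; the "15m"-style expression format is an assumption inferred from the class name and is not shown in this diff:

from cognite.extractorutils.unstable.configuration.models import ConfigModel, TimeIntervalConfig


class PollingConfig(ConfigModel):
    # The __get_pydantic_core_schema__ hook above lets pydantic coerce a plain
    # string or int into a TimeIntervalConfig when validating this field.
    poll_interval: TimeIntervalConfig


interval = TimeIntervalConfig("15m")  # assumed expression format
print(interval.seconds)    # 900, if "15m" parses as fifteen minutes
print(interval.minutes)    # 15.0
print(interval.timedelta)  # 0:15:00 (a timedelta)
print(interval)            # the original expression string
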

@@ -152,6 +204,15 @@ class _ConnectionParameters(ConfigModel):


  class ConnectionConfig(ConfigModel):
+ """
+ Configuration for connecting to a Cognite Data Fusion project.
+
+ This configuration includes the project name, base URL, integration name, and authentication details, as well as
+ optional connection parameters.
+
+ This configuration is common for all extractors.
+ """
+
  project: str
  base_url: str

@@ -162,6 +223,15 @@
  connection: _ConnectionParameters = Field(default_factory=_ConnectionParameters)

  def get_cognite_client(self, client_name: str) -> CogniteClient:
+ """
+ Create a CogniteClient instance using the configuration parameters.
+
+ Args:
+ client_name: Name of the client, set as the x-cdp-app header in the requests
+
+ Returns:
+ CogniteClient: An instance of CogniteClient configured with the provided parameters.
+ """
  from cognite.client.config import global_config

  global_config.disable_pypi_version_check = True
@@ -218,6 +288,26 @@ class ConnectionConfig(ConfigModel):

  @classmethod
  def from_environment(cls) -> "ConnectionConfig":
+ """
+ Create a ConnectionConfig instance from environment variables.
+
+ Environment variables should be set as follows:
+ - COGNITE_PROJECT: The name of the Cognite Data Fusion project.
+ - COGNITE_BASE_URL: The base URL of the Cognite Data Fusion instance.
+ - COGNITE_INTEGRATION: The external ID of the corresponding integration in CDF.
+ - COGNITE_CLIENT_ID: The client ID for authentication.
+ - COGNITE_TOKEN_SCOPES: The scopes for the token.
+ - COGNITE_CLIENT_SECRET: The client secret for authentication (if using client credentials).
+ - COGNITE_TOKEN_URL: The token URL for authentication (if using client credentials).
+ - COGNITE_CLIENT_CERTIFICATE_PATH: The path to the client certificate (if using client certificate).
+ - COGNITE_AUTHORITY_URL: The authority URL for authentication (if using client certificate).
+
+ Returns:
+ ConnectionConfig: An instance of ConnectionConfig populated with the environment variables.
+
+ Raises:
+ KeyError: If any of the required environment variables are missing.
+ """
  auth: AuthenticationConfig
  if "COGNITE_CLIENT_SECRET" in os.environ:
  auth = _ClientCredentialsConfig(
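
Based on the environment variables documented above, a minimal client-credentials sketch (all values are placeholders):

import os

from cognite.extractorutils.unstable.configuration.models import ConnectionConfig

# Placeholder values; the presence of COGNITE_CLIENT_SECRET selects the
# client-credentials flow, per the branch directly below.
os.environ.update(
    {
        "COGNITE_PROJECT": "my-project",
        "COGNITE_BASE_URL": "https://api.cognitedata.com",
        "COGNITE_INTEGRATION": "my-integration",
        "COGNITE_CLIENT_ID": "my-client-id",
        "COGNITE_TOKEN_SCOPES": "https://api.cognitedata.com/.default",
        "COGNITE_CLIENT_SECRET": "my-secret",
        "COGNITE_TOKEN_URL": "https://login.example.com/oauth2/token",
    }
)

connection = ConnectionConfig.from_environment()
client = connection.get_cognite_client("my-extractor")  # client_name becomes the x-cdp-app header
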
@@ -248,11 +338,19 @@


  class CronConfig(ConfigModel):
+ """
+ Configuration parameter for setting a cron schedule.
+ """
+
  type: Literal["cron"]
  expression: str


  class IntervalConfig(ConfigModel):
+ """
+ Configuration parameter for setting an interval schedule.
+ """
+
  type: Literal["interval"]
  expression: TimeIntervalConfig
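
Because the ScheduleConfig union referenced in the next hunk header discriminates on the type field, either shape can be used wherever a schedule is expected; a sketch with an illustrative wrapper model (and an assumed "30s" interval expression):

from cognite.extractorutils.unstable.configuration.models import ConfigModel, ScheduleConfig


class TaskConfig(ConfigModel):
    # Hypothetical wrapper used only for this illustration.
    schedule: ScheduleConfig


# The "type" key selects the concrete model: CronConfig or IntervalConfig.
cron_task = TaskConfig.model_validate({"schedule": {"type": "cron", "expression": "*/5 * * * *"}})
interval_task = TaskConfig.model_validate({"schedule": {"type": "interval", "expression": "30s"}})
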

@@ -261,6 +359,10 @@ ScheduleConfig = Annotated[CronConfig | IntervalConfig, Field(discriminator="typ


  class LogLevel(Enum):
+ """
+ Enumeration of log levels for the extractor.
+ """
+
  CRITICAL = "CRITICAL"
  ERROR = "ERROR"
  WARNING = "WARNING"
@@ -269,6 +371,10 @@ class LogLevel(Enum):


  class LogFileHandlerConfig(ConfigModel):
+ """
+ Configuration for a log handler that writes to a file, with daily rotation.
+ """
+
  type: Literal["file"]
  path: Path
  level: LogLevel
@@ -276,6 +382,10 @@


  class LogConsoleHandlerConfig(ConfigModel):
+ """
+ Configuration for a log handler that writes to standard output.
+ """
+
  type: Literal["console"]
  level: LogLevel

@@ -289,4 +399,8 @@ def _log_handler_default() -> list[LogHandlerConfig]:


  class ExtractorConfig(ConfigModel):
+ """
+ Base class for application configuration for extractors.
+ """
+
  log_handlers: list[LogHandlerConfig] = Field(default_factory=_log_handler_default)
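
Putting the logging models together, an extractor's application config could be populated like this (the handler values are placeholders, and only log levels visible in this hunk are used):

from cognite.extractorutils.unstable.configuration.models import ExtractorConfig

# Overrides the default from _log_handler_default with an explicit console
# handler plus a daily-rotating file handler; note the kebab-case key.
config = ExtractorConfig.model_validate(
    {
        "log-handlers": [
            {"type": "console", "level": "WARNING"},
            {"type": "file", "path": "logs/extractor.log", "level": "ERROR"},
        ]
    }
)
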
@@ -0,0 +1,5 @@
+ """
+ The ``core`` package contains the core functionality for defining and managing extractors.
+
+ It contains the base class for extractors, the runtime for running extractors, and classes for tasks and errors.
+ """
@@ -1,5 +1,5 @@
  """
- Temporary holding place for DTOs against Extraction Pipelines 2.0 until it's in the SDK
+ Temporary holding place for DTOs against Extraction Pipelines 2.0 until it's in the SDK.
  """

  from typing import Any, Literal
@@ -10,9 +10,11 @@ from pydantic import BaseModel, ConfigDict

  class CogniteModel(BaseModel):
  """
- Base class for DTO classes based on pydantic, but with a few tweaks to make it inline with the CDF API guidelines:
+ Base class for DTO classes based on pydantic.
+
+ With a few tweaks to make it inline with the CDF API guidelines:
  * camelCase instead of snake_case when serializing/deserializing into/from JSON
- * exclude Nones from serialized JSON instead of having nulls in the response text
+ * exclude Nones from serialized JSON instead of having nulls in the response text.
  """

  def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
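
A sketch of the behaviour that docstring describes, using a hypothetical DTO; note that _dto is an internal module and that the exact defaults applied by the model_dump override are not visible in this hunk:

from cognite.extractorutils.unstable.core._dto import CogniteModel


class TaskUpdate(CogniteModel):
    # Hypothetical DTO used only for this illustration.
    external_id: str
    error_count: int | None = None


task = TaskUpdate.model_validate({"externalId": "my-task"})
# Expected, if the override applies camelCase aliases and drops Nones as described:
# {"externalId": "my-task"}
print(task.model_dump())
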