cognite-extractor-utils 7.2.2__tar.gz → 7.3.0__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of cognite-extractor-utils might be problematic.

Files changed (31)
  1. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/PKG-INFO +2 -1
  2. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/__init__.py +1 -1
  3. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/configtools/elements.py +35 -6
  4. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/files.py +63 -37
  5. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/pyproject.toml +5 -4
  6. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/LICENSE +0 -0
  7. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/README.md +0 -0
  8. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/_inner_util.py +0 -0
  9. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/base.py +0 -0
  10. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/configtools/__init__.py +0 -0
  11. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/configtools/_util.py +0 -0
  12. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/configtools/loaders.py +0 -0
  13. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/exceptions.py +0 -0
  14. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/metrics.py +0 -0
  15. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/py.typed +0 -0
  16. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/statestore/__init__.py +0 -0
  17. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/statestore/_base.py +0 -0
  18. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/statestore/hashing.py +0 -0
  19. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/statestore/watermark.py +0 -0
  20. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/threading.py +0 -0
  21. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/__init__.py +0 -0
  22. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/_base.py +0 -0
  23. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/_metrics.py +0 -0
  24. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/assets.py +0 -0
  25. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/data_modeling.py +0 -0
  26. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/events.py +0 -0
  27. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/raw.py +0 -0
  28. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader/time_series.py +0 -0
  29. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader_extractor.py +0 -0
  30. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/uploader_types.py +0 -0
  31. {cognite_extractor_utils-7.2.2 → cognite_extractor_utils-7.3.0}/cognite/extractorutils/util.py +0 -0

PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cognite-extractor-utils
-Version: 7.2.2
+Version: 7.3.0
 Summary: Utilities for easier development of extractors for CDF
 Home-page: https://github.com/cognitedata/python-extractor-utils
 License: Apache-2.0
@@ -21,6 +21,7 @@ Requires-Dist: azure-keyvault-secrets (>=4.7.0,<5.0.0)
 Requires-Dist: cognite-sdk (>=7.43.3,<8.0.0)
 Requires-Dist: dacite (>=1.6.0,<2.0.0)
 Requires-Dist: decorator (>=5.1.1,<6.0.0)
+Requires-Dist: httpx (>=0.27.0,<0.28.0)
 Requires-Dist: more-itertools (>=10.0.0,<11.0.0)
 Requires-Dist: orjson (>=3.10.3,<4.0.0)
 Requires-Dist: prometheus-client (>0.7.0,<=1.0.0)

cognite/extractorutils/__init__.py:
@@ -16,5 +16,5 @@
 Cognite extractor utils is a Python package that simplifies the development of new extractors.
 """

-__version__ = "7.2.2"
+__version__ = "7.3.0"
 from .base import Extractor

cognite/extractorutils/configtools/elements.py:
@@ -20,7 +20,7 @@ from enum import Enum
 from logging.handlers import TimedRotatingFileHandler
 from time import sleep
 from typing import Any, Dict, List, Optional, Tuple, Union
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse

 import yaml
 from prometheus_client import REGISTRY, start_http_server
@@ -70,7 +70,7 @@ class AuthenticatorConfig:

     client_id: str
     scopes: List[str]
-    secret: Optional[str]
+    secret: Optional[str] = None
     tenant: Optional[str] = None
     token_url: Optional[str] = None
     resource: Optional[str] = None
@@ -264,6 +264,18 @@ class FileSizeConfig(yaml.YAMLObject):
         return self._expression


+path_elem_regex = re.compile(r"^([a-zA-Z0-9.\-_~!$&'()*+,;=:@]|%[A-F0-9]{2})*$")
+
+
+def _validate_https_url(value: str, name: str) -> None:
+    try:
+        url = urlparse(value)
+    except Exception as e:
+        raise InvalidConfigError(f"{name} ({value}) is not a valid URL") from e
+    if url.scheme != "https":
+        raise InvalidConfigError(f"{name} ({value}) must be HTTPS")
+
+
 @dataclass
 class CogniteConfig:
     """
@@ -272,10 +284,10 @@ class CogniteConfig:

     project: str
     idp_authentication: AuthenticatorConfig
-    data_set: Optional[EitherIdConfig]
-    data_set_id: Optional[int]
-    data_set_external_id: Optional[str]
-    extraction_pipeline: Optional[EitherIdConfig]
+    data_set: Optional[EitherIdConfig] = None
+    data_set_id: Optional[int] = None
+    data_set_external_id: Optional[str] = None
+    extraction_pipeline: Optional[EitherIdConfig] = None
     timeout: TimeIntervalConfig = TimeIntervalConfig("30s")
     connection: ConnectionConfig = field(default_factory=ConnectionConfig)
     security_categories: Optional[List[int]] = None
@@ -300,11 +312,20 @@ class CogniteConfig:
         global_config.disable_ssl = self.connection.disable_ssl
         global_config.proxies = self.connection.proxies

+        if not self.project:
+            raise InvalidConfigError("Project is not set")
+        if not path_elem_regex.match(self.project):
+            raise InvalidConfigError(f"Project ({self.project}) is not valid")
+
         credential_provider: CredentialProvider
         if self.idp_authentication.certificate:
             if self.idp_authentication.certificate.authority_url:
                 authority_url = self.idp_authentication.certificate.authority_url
+                _validate_https_url(self.idp_authentication.certificate.authority_url, "Authority URL")
             elif self.idp_authentication.tenant:
+                _validate_https_url(self.idp_authentication.authority, "Authority")
+                if not path_elem_regex.match(self.idp_authentication.tenant):
+                    raise InvalidConfigError(f"Tenant {self.idp_authentication.tenant} is not valid")
                 authority_url = urljoin(self.idp_authentication.authority, self.idp_authentication.tenant)
             else:
                 raise InvalidConfigError("Either authority-url or tenant is required for certificate authentication")
@@ -312,6 +333,8 @@ class CogniteConfig:
                 self.idp_authentication.certificate.path,
                 self.idp_authentication.certificate.password,
             )
+            if not self.idp_authentication.scopes:
+                _logger.warn("No scopes configured. Authenticating with CDF is unlikely to work correctly")
             credential_provider = OAuthClientCertificate(
                 authority_url=authority_url,
                 client_id=self.idp_authentication.client_id,
@@ -323,14 +346,20 @@ class CogniteConfig:
         elif self.idp_authentication.secret:
             kwargs: Dict[str, Any] = {}
             if self.idp_authentication.token_url:
+                _validate_https_url(self.idp_authentication.token_url, "Token URL")
                 kwargs["token_url"] = self.idp_authentication.token_url
             elif self.idp_authentication.tenant:
+                _validate_https_url(self.idp_authentication.authority, "Authority")
+                if not path_elem_regex.match(self.idp_authentication.tenant):
+                    raise InvalidConfigError(f"Tenant ({self.idp_authentication.tenant}) is not valid")
                 base_url = urljoin(self.idp_authentication.authority, self.idp_authentication.tenant)
                 kwargs["token_url"] = f"{base_url}/oauth2/v2.0/token"
             else:
                 raise InvalidConfigError("Either token-url or tenant is required for client credentials authentication")
             kwargs["client_id"] = self.idp_authentication.client_id
             kwargs["client_secret"] = self.idp_authentication.secret
+            if not self.idp_authentication.scopes:
+                _logger.warning("No scopes configured. Authenticating with CDF is unlikely to work correctly")
             kwargs["scopes"] = self.idp_authentication.scopes
             if token_custom_args is None:
                 token_custom_args = {}
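
Note: with these changes, a client-credentials token URL is either taken from token-url (now required to be HTTPS) or derived from authority plus tenant. A quick, hedged sketch of that derivation; the authority and tenant values below are placeholders mimicking a typical Entra ID setup, not defaults from the package:

# Hedged sketch of the token-url derivation shown above, with placeholder values.
from urllib.parse import urljoin

authority = "https://login.microsoftonline.com/"
tenant = "my-tenant-id"

base_url = urljoin(authority, tenant)
token_url = f"{base_url}/oauth2/v2.0/token"
print(token_url)  # https://login.microsoftonline.com/my-tenant-id/oauth2/v2.0/token
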

cognite/extractorutils/uploader/files.py:
@@ -18,8 +18,9 @@ from io import BytesIO, RawIOBase
 from math import ceil
 from os import PathLike
 from types import TracebackType
-from typing import Any, BinaryIO, Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union

+from httpx import URL, Client, Headers, Request, StreamConsumed, SyncByteStream
 from requests.utils import super_len

 from cognite.client import CogniteClient
@@ -140,6 +141,22 @@ class ChunkedStream(RawIOBase, BinaryIO):
         return True


+class IOByteStream(SyncByteStream):
+    CHUNK_SIZE = 65_536
+
+    def __init__(self, stream: BinaryIO) -> None:
+        self._stream = stream
+        self._is_stream_consumed = False
+
+    def __iter__(self) -> Iterator[bytes]:
+        if self._is_stream_consumed:
+            raise StreamConsumed()
+        chunk = self._stream.read(self.CHUNK_SIZE)
+        while chunk:
+            yield chunk
+            chunk = self._stream.read(self.CHUNK_SIZE)
+
+
 class IOFileUploadQueue(AbstractUploadQueue):
     """
     Upload queue for files using BinaryIO
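
Note: IOByteStream adapts a plain BinaryIO to the byte-stream interface httpx expects, yielding fixed 64 KiB chunks so a large upload body is never held in memory at once. A tiny, hedged check of that chunking, with BytesIO standing in for a real file handle; the import path is an assumption based on where this diff adds the class:

# Hedged sketch: observing how the new IOByteStream chunks its input.
from io import BytesIO

from cognite.extractorutils.uploader.files import IOByteStream  # assumed location, per this diff

stream = IOByteStream(BytesIO(b"x" * 150_000))
print([len(chunk) for chunk in stream])  # [65536, 65536, 18928] given CHUNK_SIZE = 65_536
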
@@ -205,6 +222,8 @@ class IOFileUploadQueue(AbstractUploadQueue):

         self._full_queue = threading.Condition()

+        self._httpx_client = Client(follow_redirects=True)
+
         global _QUEUES, _QUEUES_LOCK
         with _QUEUES_LOCK:
             self._pool = ThreadPoolExecutor(
@@ -266,44 +285,32 @@ class IOFileUploadQueue(AbstractUploadQueue):
                    f"File {file_meta.external_id} is larger than 5GiB ({size})"
                    f", uploading in {chunks.chunk_count} chunks"
                )
-                with self.cdf_client.files.multipart_upload_session(
-                    file_meta.name if file_meta.name is not None else "",
-                    parts=chunks.chunk_count,
-                    overwrite=self.overwrite_existing,
-                    external_id=file_meta.external_id,
-                    source=file_meta.source,
-                    mime_type=file_meta.mime_type,
-                    metadata=file_meta.metadata,
-                    directory=file_meta.directory,
-                    asset_ids=file_meta.asset_ids,
-                    data_set_id=file_meta.data_set_id,
-                    labels=file_meta.labels,
-                    geo_location=file_meta.geo_location,
-                    source_created_time=file_meta.source_created_time,
-                    source_modified_time=file_meta.source_modified_time,
-                    security_categories=file_meta.security_categories,
-                ) as session:
-                    while chunks.next_chunk():
-                        session.upload_part(chunks.current_chunk, chunks)
-                    file_meta = session.file_metadata
+
+                res = self.cdf_client.files._post(
+                    url_path="/files/initmultipartupload",
+                    json=file_meta.dump(camel_case=True),
+                    params={"overwrite": self.overwrite_existing, "parts": chunks.chunk_count},
+                )
+                returned_file_metadata = res.json()
+                upload_urls = returned_file_metadata["uploadUrls"]
+                upload_id = returned_file_metadata["uploadId"]
+                file_meta = FileMetadata.load(returned_file_metadata)
+
+                for url in upload_urls:
+                    chunks.next_chunk()
+                    resp = self._httpx_client.send(self._get_file_upload_request(url, chunks, len(chunks)))
+                    resp.raise_for_status()
+
+                self.cdf_client.files._post(
+                    url_path="/files/completemultipartupload", json={"id": file_meta.id, "uploadId": upload_id}
+                )
+
            else:
-                file_meta = self.cdf_client.files.upload_bytes(
-                    file,
-                    file_meta.name if file_meta.name is not None else "",
-                    overwrite=self.overwrite_existing,
-                    external_id=file_meta.external_id,
-                    source=file_meta.source,
-                    mime_type=file_meta.mime_type,
-                    metadata=file_meta.metadata,
-                    directory=file_meta.directory,
-                    asset_ids=file_meta.asset_ids,
-                    data_set_id=file_meta.data_set_id,
-                    labels=file_meta.labels,
-                    geo_location=file_meta.geo_location,
-                    source_created_time=file_meta.source_created_time,
-                    source_modified_time=file_meta.source_modified_time,
-                    security_categories=file_meta.security_categories,
+                file_meta, url = self.cdf_client.files.create(
+                    file_metadata=file_meta, overwrite=self.overwrite_existing
                )
+                resp = self._httpx_client.send(self._get_file_upload_request(url, file, size))
+                resp.raise_for_status()

            if self.post_upload_function:
                try:
@@ -338,6 +345,25 @@ class IOFileUploadQueue(AbstractUploadQueue):
        self.files_queued.inc()
        self.queue_size.set(self.upload_queue_size)

+    def _get_file_upload_request(self, url_str: str, stream: BinaryIO, size: int) -> Request:
+        url = URL(url_str)
+        headers = Headers(self._httpx_client.headers)
+        headers.update(
+            {
+                "Accept": "*/*",
+                "Content-Length": str(size),
+                "Host": url.netloc.decode("ascii"),
+                "x-cdp-app": self.cdf_client._config.client_name,
+            }
+        )
+
+        return Request(
+            method="PUT",
+            url=url,
+            stream=IOByteStream(stream),
+            headers=headers,
+        )
+
     def upload(self, fail_on_errors: bool = True, timeout: Optional[float] = None) -> None:
         """
         Wait for all uploads to finish
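
Note: _get_file_upload_request follows the shape of a hand-rolled streaming PUT — copy the client's default headers, pin Content-Length and Host, and attach the body as a stream rather than a bytes object. A standalone, hedged sketch of the same pattern against a placeholder pre-signed URL; put_file, the URL and the file path are illustrative and not part of the package:

# Hedged sketch: a streaming PUT to a pre-signed URL with httpx, mirroring the
# headers the new helper sets. All names and URLs here are placeholders.
from pathlib import Path

from httpx import URL, Client, Headers, Request


def put_file(client: Client, upload_url: str, path: Path) -> None:
    url = URL(upload_url)
    headers = Headers(client.headers)
    # An explicit Content-Length keeps httpx from sending the iterator body
    # with chunked transfer encoding.
    headers.update(
        {
            "Accept": "*/*",
            "Content-Length": str(path.stat().st_size),
            "Host": url.netloc.decode("ascii"),
        }
    )
    with path.open("rb") as f:
        chunks = iter(lambda: f.read(65_536), b"")  # read the body in 64 KiB pieces
        request = Request(method="PUT", url=url, headers=headers, content=chunks)
        client.send(request).raise_for_status()


# Usage, with a placeholder URL:
# with Client(follow_redirects=True) as client:
#     put_file(client, "https://upload.example.com/presigned", Path("report.pdf"))
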

pyproject.toml:
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cognite-extractor-utils"
-version = "7.2.2"
+version = "7.3.0"
 description = "Utilities for easier development of extractors for CDF"
 authors = ["Mathias Lohne <mathias.lohne@cognite.com>"]
 license = "Apache-2.0"
@@ -66,13 +66,14 @@ python-dotenv = "^1.0.0"
 azure-identity = "^1.14.0"
 azure-keyvault-secrets = "^4.7.0"
 orjson = "^3.10.3"
+httpx = "^0.27.0"

 [tool.poetry.extras]
 experimental = ["cognite-sdk-experimental"]

 [tool.poetry.group.dev.dependencies]
-mypy = "1.10.1"
-ruff = "^0.4.0"
+mypy = "1.11.1"
+ruff = "^0.5.0"
 pytest = "^8.0.0"
 pytest-cov = "^5.0.0"
 sphinx = "^7.0.0"
@@ -85,7 +86,7 @@ parameterized = "*"
 requests = "^2.31.0"
 types-requests = "^2.31.0.20240125"
 httpx = "^0.27.0"
-faker = "^25.2.0"
+faker = "^26.0.0"

 [build-system]
 requires = ["poetry-core>=1.0.0"]