cognite-toolkit 0.6.88__py3-none-any.whl → 0.6.90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of cognite-toolkit has been flagged as potentially problematic.

Files changed (29)
  1. cognite_toolkit/_cdf_tk/commands/_migrate/canvas.py +60 -5
  2. cognite_toolkit/_cdf_tk/commands/_migrate/command.py +4 -2
  3. cognite_toolkit/_cdf_tk/commands/_migrate/conversion.py +161 -44
  4. cognite_toolkit/_cdf_tk/commands/_migrate/data_classes.py +10 -10
  5. cognite_toolkit/_cdf_tk/commands/_migrate/data_mapper.py +7 -3
  6. cognite_toolkit/_cdf_tk/commands/_migrate/migration_io.py +8 -10
  7. cognite_toolkit/_cdf_tk/commands/build_cmd.py +1 -1
  8. cognite_toolkit/_cdf_tk/commands/pull.py +6 -5
  9. cognite_toolkit/_cdf_tk/data_classes/_build_variables.py +120 -14
  10. cognite_toolkit/_cdf_tk/data_classes/_built_resources.py +1 -1
  11. cognite_toolkit/_cdf_tk/resource_classes/agent.py +1 -0
  12. cognite_toolkit/_cdf_tk/resource_classes/infield_cdmv1.py +92 -0
  13. cognite_toolkit/_cdf_tk/storageio/__init__.py +2 -0
  14. cognite_toolkit/_cdf_tk/storageio/_annotations.py +102 -0
  15. cognite_toolkit/_cdf_tk/tracker.py +6 -6
  16. cognite_toolkit/_cdf_tk/utils/fileio/_readers.py +90 -44
  17. cognite_toolkit/_cdf_tk/utils/http_client/_client.py +6 -4
  18. cognite_toolkit/_cdf_tk/utils/http_client/_data_classes.py +2 -0
  19. cognite_toolkit/_cdf_tk/utils/useful_types.py +7 -4
  20. cognite_toolkit/_repo_files/GitHub/.github/workflows/deploy.yaml +1 -1
  21. cognite_toolkit/_repo_files/GitHub/.github/workflows/dry-run.yaml +1 -1
  22. cognite_toolkit/_resources/cdf.toml +1 -1
  23. cognite_toolkit/_version.py +1 -1
  24. {cognite_toolkit-0.6.88.dist-info → cognite_toolkit-0.6.90.dist-info}/METADATA +1 -1
  25. {cognite_toolkit-0.6.88.dist-info → cognite_toolkit-0.6.90.dist-info}/RECORD +28 -27
  26. cognite_toolkit/_cdf_tk/commands/_migrate/base.py +0 -106
  27. {cognite_toolkit-0.6.88.dist-info → cognite_toolkit-0.6.90.dist-info}/WHEEL +0 -0
  28. {cognite_toolkit-0.6.88.dist-info → cognite_toolkit-0.6.90.dist-info}/entry_points.txt +0 -0
  29. {cognite_toolkit-0.6.88.dist-info → cognite_toolkit-0.6.90.dist-info}/licenses/LICENSE +0 -0
@@ -8,11 +8,11 @@ from functools import cached_property
  from pathlib import Path
  from typing import Any, Literal, SupportsIndex, overload

+ from cognite_toolkit._cdf_tk.cruds._resource_cruds.transformation import TransformationCRUD
+ from cognite_toolkit._cdf_tk.data_classes._module_directories import ModuleLocation
  from cognite_toolkit._cdf_tk.exceptions import ToolkitValueError
  from cognite_toolkit._cdf_tk.feature_flags import Flags

- from ._module_directories import ModuleLocation
-
  if sys.version_info >= (3, 11):
  from typing import Self
  else:
@@ -161,16 +161,19 @@ class BuildVariables(tuple, Sequence[BuildVariable]):
  ]

  @overload
- def replace(self, content: str, file_suffix: str = ".yaml", use_placeholder: Literal[False] = False) -> str: ...
+ def replace(self, content: str, file_path: Path | None = None, use_placeholder: Literal[False] = False) -> str: ...

  @overload
  def replace(
- self, content: str, file_suffix: str = ".yaml", use_placeholder: Literal[True] = True
+ self, content: str, file_path: Path | None = None, use_placeholder: Literal[True] = True
  ) -> tuple[str, dict[str, BuildVariable]]: ...

  def replace(
- self, content: str, file_suffix: str = ".yaml", use_placeholder: bool = False
+ self, content: str, file_path: Path | None = None, use_placeholder: bool = False
  ) -> str | tuple[str, dict[str, BuildVariable]]:
+ # Extract file suffix from path, default to .yaml if not provided
+ file_suffix = file_path.suffix if file_path and file_path.suffix else ".yaml"
+
  variable_by_placeholder: dict[str, BuildVariable] = {}
  for variable in self:
  if not use_placeholder:
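As a quick illustration of the suffix defaulting added in this hunk, the small standalone snippet below applies the same rule with pathlib only; the sample paths are hypothetical:

from pathlib import Path

def suffix_or_yaml(file_path: Path | None) -> str:
    # Same rule as the added line above: use the path's suffix, fall back to ".yaml".
    return file_path.suffix if file_path and file_path.suffix else ".yaml"

print(suffix_or_yaml(Path("modules/transformations/my.Transformation.yaml")))  # .yaml
print(suffix_or_yaml(Path("modules/transformations/query.sql")))               # .sql
print(suffix_or_yaml(None))                                                    # .yaml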
@@ -180,22 +183,125 @@ class BuildVariables(tuple, Sequence[BuildVariable]):
  variable_by_placeholder[replace] = variable

  _core_pattern = rf"{{{{\s*{variable.key}\s*}}}}"
- if file_suffix in {".yaml", ".yml", ".json"}:
- # Preserve data types
- pattern = _core_pattern
- if isinstance(replace, str) and (replace.isdigit() or replace.endswith(":")):
- replace = f'"{replace}"'
- pattern = rf"'{_core_pattern}'|{_core_pattern}|" + rf'"{_core_pattern}"'
- elif replace is None:
- replace = "null"
- content = re.sub(pattern, str(replace), content)
+ if file_suffix == ".sql":
+ # For SQL files, convert lists to SQL-style tuples
+ if isinstance(replace, list):
+ replace = self._format_list_as_sql_tuple(replace)
+ content = re.sub(_core_pattern, str(replace), content)
+ elif file_suffix in {".yaml", ".yml", ".json"}:
+ # Check if this is a transformation file (ends with Transformation.yaml/yml)
+ is_transformation_file = file_path is not None and f".{TransformationCRUD.kind}." in file_path.name
+ # Check if variable is within a query field (SQL context)
+ is_in_query_field = self._is_in_query_field(content, variable.key)
+
+ # For lists in query fields, use SQL-style tuples
+ # For transformation files, ensure SQL conversion is applied to query property variables
+ if is_transformation_file and is_in_query_field and isinstance(replace, list):
+ replace = self._format_list_as_sql_tuple(replace)
+ # Use simple pattern for SQL context (no YAML quoting needed)
+ content = re.sub(_core_pattern, str(replace), content)
+ else:
+ # Preserve data types for YAML
+ pattern = _core_pattern
+ if isinstance(replace, str) and (replace.isdigit() or replace.endswith(":")):
+ replace = f'"{replace}"'
+ pattern = rf"'{_core_pattern}'|{_core_pattern}|" + rf'"{_core_pattern}"'
+ elif replace is None:
+ replace = "null"
+ content = re.sub(pattern, str(replace), content)
  else:
+ # For other file types, use simple string replacement
  content = re.sub(_core_pattern, str(replace), content)
  if use_placeholder:
  return content, variable_by_placeholder
  else:
  return content

+ @staticmethod
+ def _is_transformation_file(file_path: Path) -> bool:
+ """Check if the file path indicates a transformation YAML file.
+
+ Transformation files are YAML files in the "transformations" folder.
+
+ Args:
+ file_path: The file path to check
+
+ Returns:
+ True if the file is a transformation YAML file
+ """
+ # Check if path contains "transformations" folder and ends with .yaml/.yml
+ path_str = file_path.as_posix().lower()
+ return "transformations" in path_str and file_path.suffix.lower() in {".yaml", ".yml"}
+
+ @staticmethod
+ def _format_list_as_sql_tuple(replace: list[Any]) -> str:
+ """Format a list as a SQL-style tuple string.
+
+ Args:
+ replace: The list to format
+
+ Returns:
+ SQL tuple string, e.g., "('A', 'B', 'C')" or "()" for empty lists
+ """
+ if not replace:
+ # Empty list becomes empty SQL tuple
+ return "()"
+ else:
+ # Format list as SQL tuple: ('A', 'B', 'C')
+ formatted_items = []
+ for item in replace:
+ if item is None:
+ formatted_items.append("NULL")
+ elif isinstance(item, str):
+ formatted_items.append(f"'{item}'")
+ else:
+ formatted_items.append(str(item))
+ return f"({', '.join(formatted_items)})"
+
+ @staticmethod
+ def _is_in_query_field(content: str, variable_key: str) -> bool:
+ """Check if a variable is within a query field in YAML.
+
+ Assumes query is a top-level property. This detects various YAML formats:
+ - query: >-
+ - query: |
+ - query: "..."
+ - query: ...
+ """
+ lines = content.split("\n")
+ variable_pattern = rf"{{{{\s*{re.escape(variable_key)}\s*}}}}"
+ in_query_field = False
+
+ for line in lines:
+ # Check if this line starts a top-level query field
+ query_match = re.match(r"^query\s*:\s*(.*)$", line)
+ if query_match:
+ in_query_field = True
+ query_content_start = query_match.group(1).strip()
+
+ # Check if variable is on the same line as query: declaration
+ if re.search(variable_pattern, line):
+ return True
+
+ # If query content starts on same line (not a block scalar), check it
+ if query_content_start and not query_content_start.startswith(("|", ">", "|-", ">-", "|+", ">+")):
+ if re.search(variable_pattern, query_content_start):
+ return True
+ continue
+
+ # Check if we're still in the query field
+ if in_query_field:
+ # If we hit another top-level property, we've exited the query field
+ if re.match(r"^\w+\s*:", line):
+ in_query_field = False
+ continue
+
+ # We're still in the query field, check for variable
+ if re.search(variable_pattern, line):
+ return True
+
+ return False
+
  # Implemented to get correct type hints
  def __iter__(self) -> Iterator[BuildVariable]:
  return super().__iter__()
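To make the new .sql handling concrete, here is a self-contained sketch that mirrors the list-to-SQL-tuple formatting and placeholder substitution introduced above; the function name format_list_as_sql_tuple and the {{ sites }} variable are illustrative stand-ins, not Toolkit API:

import re

def format_list_as_sql_tuple(values: list) -> str:
    # Mirrors BuildVariables._format_list_as_sql_tuple from the hunk above:
    # empty list -> "()", strings quoted, None -> NULL, other values via str().
    if not values:
        return "()"
    items = []
    for item in values:
        if item is None:
            items.append("NULL")
        elif isinstance(item, str):
            items.append(f"'{item}'")
        else:
            items.append(str(item))
    return f"({', '.join(items)})"

# Hypothetical .sql template: a {{ sites }} list variable becomes a SQL tuple.
sql = "SELECT * FROM assets WHERE site IN {{ sites }}"
pattern = r"\{\{\s*sites\s*\}\}"
print(re.sub(pattern, format_list_as_sql_tuple(["OSLO", "BERGEN"]), sql))
# SELECT * FROM assets WHERE site IN ('OSLO', 'BERGEN')
print(format_list_as_sql_tuple([]))         # ()
print(format_list_as_sql_tuple([1, None]))  # (1, NULL)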
@@ -158,7 +158,7 @@ class BuiltResourceFull(BuiltResource[T_ID]):
  def load_resource_dict(
  self, environment_variables: dict[str, str | None], validate: bool = False
  ) -> dict[str, Any]:
- content = self.build_variables.replace(safe_read(self.source.path))
+ content = self.build_variables.replace(safe_read(self.source.path), self.source.path)
  loader = cast(ResourceCRUD, get_crud(self.resource_dir, self.kind))
  raw = load_yaml_inject_variables(
  content,
@@ -55,3 +55,4 @@ class AgentYAML(ToolkitResource):
  "azure/gpt-4o-mini", description="The name of the model to use. Defaults to your CDF project's default model."
  )
  tools: list[AgentTool] | None = Field(None, description="A list of tools available to the agent.", max_length=20)
+ runtime_version: str | None = Field(None, description="The runtime version")
@@ -0,0 +1,92 @@
+ from typing import Any
+
+ from .base import BaseModelResource, ToolkitResource
+
+
+ class ObservationFeatureToggles(BaseModelResource):
+ """Feature toggles for observations."""
+
+ is_enabled: bool | None = None
+ is_write_back_enabled: bool | None = None
+ notifications_endpoint_external_id: str | None = None
+ attachments_endpoint_external_id: str | None = None
+
+
+ class FeatureToggles(BaseModelResource):
+ """Feature toggles for InField location configuration."""
+
+ three_d: bool | None = None
+ trends: bool | None = None
+ documents: bool | None = None
+ workorders: bool | None = None
+ notifications: bool | None = None
+ media: bool | None = None
+ template_checklist_flow: bool | None = None
+ workorder_checklist_flow: bool | None = None
+ observations: ObservationFeatureToggles | None = None
+
+
+ class AccessManagement(BaseModelResource):
+ """Access management configuration."""
+
+ template_admins: list[str] | None = None # list of CDF group external IDs
+ checklist_admins: list[str] | None = None # list of CDF group external IDs
+
+
+ class ResourceFilters(BaseModelResource):
+ """Resource filters."""
+
+ spaces: list[str] | None = None
+
+
+ class RootLocationDataFilters(BaseModelResource):
+ """Data filters for root location."""
+
+ general: ResourceFilters | None = None
+ assets: ResourceFilters | None = None
+ files: ResourceFilters | None = None
+ timeseries: ResourceFilters | None = None
+
+
+ class DataExplorationConfig(BaseModelResource):
+ """Properties for DataExplorationConfig node.
+
+ Contains configuration for data exploration features:
+ - observations: Observations feature configuration
+ - activities: Activities configuration
+ - documents: Document configuration
+ - notifications: Notifications configuration
+ - assets: Asset page configuration
+ """
+
+ external_id: str
+
+ observations: dict[str, Any] | None = None # ObservationsConfigFeature
+ activities: dict[str, Any] | None = None # ActivitiesConfiguration
+ documents: dict[str, Any] | None = None # DocumentConfiguration
+ notifications: dict[str, Any] | None = None # NotificationsConfiguration
+ assets: dict[str, Any] | None = None # AssetPageConfiguration
+
+
+ class InfieldLocationConfigYAML(ToolkitResource):
+ """Properties for InFieldLocationConfig node.
+
+ Currently migrated fields:
+ - root_location_external_id: Reference to the LocationFilterDTO external ID
+ - feature_toggles: Feature toggles migrated from old configuration
+ - rootAsset: Direct relation to the root asset (space and externalId)
+ - app_instance_space: Application instance space from appDataInstanceSpace
+ - access_management: Template and checklist admin groups (from templateAdmins and checklistAdmins)
+ - disciplines: List of disciplines (from disciplines in FeatureConfiguration)
+ - data_filters: Data filters for general, assets, files, and timeseries (from dataFilters in old configuration)
+ - data_exploration_config: Direct relation to the DataExplorationConfig node (shared across all locations)
+ """
+
+ external_id: str
+
+ root_location_external_id: str | None = None
+ feature_toggles: FeatureToggles | None = None
+ app_instance_space: str | None = None
+ access_management: AccessManagement | None = None
+ data_filters: RootLocationDataFilters | None = None
+ data_exploration_config: DataExplorationConfig | None = None
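For orientation, a hypothetical payload lining up with the new InfieldLocationConfigYAML fields could look like the Python dict below. The field names come from the class above; every value is invented for illustration and not taken from any real project:

# Sketch only: values are made up, field names mirror InfieldLocationConfigYAML.
infield_location_config = {
    "external_id": "ilc_oslo_plant",
    "root_location_external_id": "loc_oslo_plant",
    "app_instance_space": "infield_app_data",
    "feature_toggles": {
        "three_d": True,
        "observations": {"is_enabled": True, "is_write_back_enabled": False},
    },
    "access_management": {"template_admins": ["gp_infield_template_admins"]},
    "data_filters": {"assets": {"spaces": ["sp_asset_data"]}},
}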
@@ -3,6 +3,7 @@ from pathlib import Path
  from cognite_toolkit._cdf_tk.utils._auxiliary import get_concrete_subclasses
  from cognite_toolkit._cdf_tk.utils.fileio import COMPRESSION_BY_SUFFIX

+ from ._annotations import FileAnnotationIO
  from ._applications import CanvasIO, ChartIO
  from ._asset_centric import AssetIO, BaseAssetCentricIO, EventIO, FileMetadataIO, HierarchyIO, TimeSeriesIO
  from ._base import (
@@ -50,6 +51,7 @@ __all__ = [
  "ChartIO",
  "ConfigurableStorageIO",
  "EventIO",
+ "FileAnnotationIO",
  "FileMetadataIO",
  "HierarchyIO",
  "InstanceIO",
@@ -0,0 +1,102 @@
+ from collections.abc import Iterable, Sequence
+ from typing import Any
+
+ from cognite.client.data_classes import Annotation, AnnotationFilter
+
+ from cognite_toolkit._cdf_tk.utils.collection import chunker_sequence
+ from cognite_toolkit._cdf_tk.utils.useful_types import JsonVal
+
+ from ._asset_centric import FileMetadataIO
+ from ._base import Page, StorageIO
+ from .selectors import AssetCentricSelector
+
+
+ class FileAnnotationIO(StorageIO[AssetCentricSelector, Annotation]):
+ SUPPORTED_DOWNLOAD_FORMATS = frozenset({".ndjson"})
+ SUPPORTED_COMPRESSIONS = frozenset({".gz"})
+ CHUNK_SIZE = 1000
+ BASE_SELECTOR = AssetCentricSelector
+
+ MISSING_ID = "<MISSING_RESOURCE_ID>"
+
+ def as_id(self, item: Annotation) -> str:
+ project = item._cognite_client.config.project
+ return f"INTERNAL_ID_project_{project}_{item.id!s}"
+
+ def stream_data(self, selector: AssetCentricSelector, limit: int | None = None) -> Iterable[Page]:
+ total = 0
+ for file_chunk in FileMetadataIO(self.client).stream_data(selector, None):
+ # Todo Support pagination. This is missing in the SDK.
+ results = self.client.annotations.list(
+ filter=AnnotationFilter(
+ annotated_resource_type="file",
+ annotated_resource_ids=[{"id": file_metadata.id} for file_metadata in file_chunk.items],
+ )
+ )
+ if limit is not None and total + len(results) > limit:
+ results = results[: limit - total]
+
+ for chunk in chunker_sequence(results, self.CHUNK_SIZE):
+ yield Page(worker_id="main", items=chunk)
+ total += len(chunk)
+ if limit is not None and total >= limit:
+ break
+
+ def count(self, selector: AssetCentricSelector) -> int | None:
+ """There is no efficient way to count annotations in CDF."""
+ return None
+
+ def data_to_json_chunk(
+ self, data_chunk: Sequence[Annotation], selector: AssetCentricSelector | None = None
+ ) -> list[dict[str, JsonVal]]:
+ files_ids: set[int] = set()
+ for item in data_chunk:
+ if item.annotated_resource_type == "file" and item.annotated_resource_id is not None:
+ files_ids.add(item.annotated_resource_id)
+ if file_id := self._get_file_id(item.data):
+ files_ids.add(file_id)
+ self.client.lookup.files.external_id(list(files_ids)) # Preload file external IDs
+ asset_ids = {asset_id for item in data_chunk if (asset_id := self._get_asset_id(item.data))}
+ self.client.lookup.assets.external_id(list(asset_ids)) # Preload asset external IDs
+ return [self.dump_annotation_to_json(item) for item in data_chunk]
+
+ def dump_annotation_to_json(self, annotation: Annotation) -> dict[str, JsonVal]:
+ """Dump annotations to a list of JSON serializable dictionaries.
+
+ Args:
+ annotation: The annotations to dump.
+
+ Returns:
+ A list of JSON serializable dictionaries representing the annotations.
+ """
+ dumped = annotation.as_write().dump()
+ if isinstance(annotated_resource_id := dumped.pop("annotatedResourceId", None), int):
+ external_id = self.client.lookup.files.external_id(annotated_resource_id)
+ dumped["annotatedResourceExternalId"] = self.MISSING_ID if external_id is None else external_id
+
+ if isinstance(data := dumped.get("data"), dict):
+ if isinstance(file_ref := data.get("fileRef"), dict) and isinstance(file_ref.get("id"), int):
+ external_id = self.client.lookup.files.external_id(file_ref.pop("id"))
+ file_ref["externalId"] = self.MISSING_ID if external_id is None else external_id
+ if isinstance(asset_ref := data.get("assetRef"), dict) and isinstance(asset_ref.get("id"), int):
+ external_id = self.client.lookup.assets.external_id(asset_ref.pop("id"))
+ asset_ref["externalId"] = self.MISSING_ID if external_id is None else external_id
+ return dumped
+
+ @classmethod
+ def _get_file_id(cls, data: dict[str, Any]) -> int | None:
+ file_ref = data.get("fileRef")
+ if isinstance(file_ref, dict):
+ id_ = file_ref.get("id")
+ if isinstance(id_, int):
+ return id_
+ return None
+
+ @classmethod
+ def _get_asset_id(cls, data: dict[str, Any]) -> int | None:
+ asset_ref = data.get("assetRef")
+ if isinstance(asset_ref, dict):
+ id_ = asset_ref.get("id")
+ if isinstance(id_, int):
+ return id_
+ return None
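As a rough sketch of the id-to-external-id remapping that dump_annotation_to_json performs above, the helper below works on a plain dict and a pre-built lookup table, standing in for the Toolkit's client.lookup services; names and sample values are invented:

MISSING_ID = "<MISSING_RESOURCE_ID>"

def remap_file_ref(dumped: dict, file_xid_by_id: dict) -> dict:
    # Replace the numeric annotatedResourceId with an external ID, falling back
    # to a placeholder when the file cannot be resolved, as in the hunk above.
    resource_id = dumped.pop("annotatedResourceId", None)
    if isinstance(resource_id, int):
        dumped["annotatedResourceExternalId"] = file_xid_by_id.get(resource_id, MISSING_ID)
    data = dumped.get("data")
    if isinstance(data, dict):
        file_ref = data.get("fileRef")
        if isinstance(file_ref, dict) and isinstance(file_ref.get("id"), int):
            file_ref["externalId"] = file_xid_by_id.get(file_ref.pop("id"), MISSING_ID)
    return dumped

annotation = {
    "annotatedResourceType": "file",
    "annotatedResourceId": 123,
    "data": {"fileRef": {"id": 456}},
}
print(remap_file_ref(annotation, {123: "pnid-001.pdf", 456: "pnid-002.pdf"}))
# annotatedResourceId 123 -> 'pnid-001.pdf', fileRef id 456 -> 'pnid-002.pdf'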
@@ -58,7 +58,7 @@ class Tracker:
  warning_details[f"warningMostCommon{no}Count"] = count
  warning_details[f"warningMostCommon{no}Name"] = warning

- positional_args, optional_args = self._parse_sys_args()
+ subcommands, optional_args = self._parse_sys_args()
  event_information = {
  "userInput": self.user_command,
  "toolkitVersion": __version__,
@@ -69,7 +69,7 @@ class Tracker:
  **warning_details,
  "result": type(result).__name__ if isinstance(result, Exception) else result,
  "error": str(result) if isinstance(result, Exception) else "",
- **positional_args,
+ "subcommands": subcommands,
  **optional_args,
  "alphaFlags": [name for name, value in self._cdf_toml.alpha_flags.items() if value],
  "plugins": [name for name, value in self._cdf_toml.plugins.items() if value],
@@ -128,9 +128,9 @@ class Tracker:
  return distinct_id

  @staticmethod
- def _parse_sys_args() -> tuple[dict[str, str], dict[str, str | bool]]:
+ def _parse_sys_args() -> tuple[list[str], dict[str, str | bool]]:
  optional_args: dict[str, str | bool] = {}
- positional_args: dict[str, str] = {}
+ subcommands: list[str] = []
  last_key: str | None = None
  if sys.argv and len(sys.argv) > 1:
  for arg in sys.argv[1:]:
@@ -147,11 +147,11 @@ class Tracker:
  optional_args[last_key] = arg
  last_key = None
  else:
- positional_args[f"positionalArg{len(positional_args)}"] = arg
+ subcommands.append(arg)

  if last_key:
  optional_args[last_key] = True
- return positional_args, optional_args
+ return subcommands, optional_args

  @property
  def _cicd(self) -> str:
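The Tracker hunks above replace the numbered positionalArg keys with a single subcommands list in the telemetry payload. A minimal sketch of that split follows, assuming flags start with "-"; the actual flag-detection logic lives outside this hunk, so treat this as an approximation only:

def split_args(argv: list[str]) -> tuple[list[str], dict[str, str | bool]]:
    # Illustrative approximation: positional tokens become subcommands,
    # "--flag value" pairs and bare "--flag" switches become optional_args.
    subcommands: list[str] = []
    optional_args: dict[str, str | bool] = {}
    last_key: str | None = None
    for arg in argv:
        if arg.startswith("-"):
            if last_key:
                optional_args[last_key] = True
            last_key = arg.lstrip("-")
        elif last_key:
            optional_args[last_key] = arg
            last_key = None
        else:
            subcommands.append(arg)
    if last_key:
        optional_args[last_key] = True
    return subcommands, optional_args

print(split_args(["deploy", "modules", "--dry-run", "--env", "dev"]))
# (['deploy', 'modules'], {'dry-run': True, 'env': 'dev'})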
@@ -7,6 +7,7 @@ from dataclasses import dataclass
  from functools import partial
  from io import TextIOWrapper
  from pathlib import Path
+ from typing import Any

  import yaml

@@ -87,26 +88,20 @@ class FailedParsing:
  error: str


- class TableReader(FileReader, ABC): ...
-
-
- class CSVReader(TableReader):
- """Reads CSV files and yields each row as a dictionary.
+ class TableReader(FileReader, ABC):
+ """Reads table-like files and yields each row as a dictionary.

  Args:
- input_file (Path): The path to the CSV file to read.
+ input_file (Path): The path to the table file to read.
  sniff_rows (int | None): Optional number of rows to sniff for
  schema detection. If None, no schema is detected. If a schema is sniffed
- from the first `sniff_rows` rows, it will be used to parse the CSV.
+ from the first `sniff_rows` rows, it will be used to parse the table.
  schema (Sequence[SchemaColumn] | None): Optional schema to use for parsing.
  You can either provide a schema or use `sniff_rows` to detect it.
  keep_failed_cells (bool): If True, failed cells will be kept in the
  `failed_cell` attribute. If False, they will be ignored.
-
  """

- format = ".csv"
-
  def __init__(
  self,
  input_file: Path,
@@ -152,18 +147,19 @@ class CSVReader(TableReader):
  @classmethod
  def sniff_schema(cls, input_file: Path, sniff_rows: int = 100) -> list[SchemaColumn]:
  """
- Sniff the schema from the first `sniff_rows` rows of the CSV file.
+ Sniff the schema from the first `sniff_rows` rows of the file.

  Args:
- input_file (Path): The path to the CSV file.
+ input_file (Path): The path to the tabular file.
  sniff_rows (int): The number of rows to read for sniffing the schema.

  Returns:
  list[SchemaColumn]: The inferred schema as a list of SchemaColumn objects.
+
  Raises:
  ValueError: If `sniff_rows` is not a positive integer.
  ToolkitFileNotFoundError: If the file does not exist.
- ToolkitValueError: If the file is not a CSV file or if there are issues with the content.
+ ToolkitValueError: If the file is not the correct format or if there are issues with the content.

  """
  if sniff_rows <= 0:
@@ -171,43 +167,50 @@ class CSVReader(TableReader):

  if not input_file.exists():
  raise ToolkitFileNotFoundError(f"File not found: {input_file.as_posix()!r}.")
- if input_file.suffix != ".csv":
- raise ToolkitValueError(f"Expected a .csv file got a {input_file.suffix!r} file instead.")
+ if input_file.suffix != cls.format:
+ raise ToolkitValueError(f"Expected a {cls.format} file got a {input_file.suffix!r} file instead.")

- with input_file.open("r", encoding="utf-8-sig") as file:
- reader = csv.DictReader(file)
- column_names = Counter(reader.fieldnames)
- if duplicated := [name for name, count in column_names.items() if count > 1]:
- raise ToolkitValueError(f"CSV file contains duplicate headers: {humanize_collection(duplicated)}")
- sample_rows: list[dict[str, str]] = []
- for no, row in enumerate(reader):
- if no >= sniff_rows:
- break
- sample_rows.append(row)
+ column_names, sample_rows = cls._read_sample_rows(input_file, sniff_rows)
+ cls._check_column_names(column_names)
+ return cls._infer_schema(sample_rows, column_names)

- if not sample_rows:
- raise ToolkitValueError(f"No data found in the file: {input_file.as_posix()!r}.")
+ @classmethod
+ @abstractmethod
+ def _read_sample_rows(cls, input_file: Path, sniff_rows: int) -> tuple[Sequence[str], list[dict[str, str]]]: ...

- schema = []
- for column_name in reader.fieldnames or []:
- sample_values = [row[column_name] for row in sample_rows if column_name in row]
- if not sample_values:
- column = SchemaColumn(name=column_name, type="string")
+ @classmethod
+ def _infer_schema(cls, sample_rows: list[dict[str, Any]], column_names: Sequence[str]) -> list[SchemaColumn]:
+ schema: list[SchemaColumn] = []
+ for column_name in column_names:
+ sample_values = [row[column_name] for row in sample_rows if column_name in row]
+ if not sample_values:
+ column = SchemaColumn(name=column_name, type="string")
+ else:
+ data_types = Counter(
+ infer_data_type_from_value(value, dtype="Json")[0] for value in sample_values if value is not None
+ )
+ if not data_types:
+ inferred_type = "string"
  else:
- data_types = Counter(
- infer_data_type_from_value(value, dtype="Json")[0]
- for value in sample_values
- if value is not None
- )
- if not data_types:
- inferred_type = "string"
- else:
- inferred_type = data_types.most_common()[0][0]
- # Json dtype is a subset of Datatype that SchemaColumn accepts
- column = SchemaColumn(name=column_name, type=inferred_type) # type: ignore[arg-type]
- schema.append(column)
+ inferred_type = data_types.most_common()[0][0]
+ # Json dtype is a subset of Datatype that SchemaColumn accepts
+ column = SchemaColumn(name=column_name, type=inferred_type) # type: ignore[arg-type]
+ schema.append(column)
  return schema

+ @classmethod
+ def _check_column_names(cls, column_names: Sequence[str]) -> None:
+ """Check for duplicate column names."""
+ duplicates = [name for name, count in Counter(column_names).items() if count > 1]
+ if duplicates:
+ raise ToolkitValueError(f"Duplicate column names found: {humanize_collection(duplicates)}.")
+
+
+ class CSVReader(TableReader):
+ """Reads CSV files and yields each row as a dictionary."""
+
+ format = ".csv"
+
  def _read_chunks_from_file(self, file: TextIOWrapper) -> Iterator[dict[str, JsonVal]]:
  if self.keep_failed_cells and self.failed_cell:
  self.failed_cell.clear()
@@ -231,10 +234,31 @@ class CSVReader(TableReader):
  with compression.open("r") as file:
  yield from csv.DictReader(file)

+ @classmethod
+ def _read_sample_rows(cls, input_file: Path, sniff_rows: int) -> tuple[Sequence[str], list[dict[str, str]]]:
+ column_names: Sequence[str] = []
+ compression = Compression.from_filepath(input_file)
+ with compression.open("r") as file:
+ reader = csv.DictReader(file)
+ column_names = reader.fieldnames or []
+ sample_rows: list[dict[str, str]] = []
+ for no, row in enumerate(reader):
+ if no >= sniff_rows:
+ break
+ sample_rows.append(row)
+
+ if not sample_rows:
+ raise ToolkitValueError(f"No data found in the file: {input_file.as_posix()!r}.")
+ return column_names, sample_rows
+

  class ParquetReader(TableReader):
  format = ".parquet"

+ def __init__(self, input_file: Path) -> None:
+ # Parquet files have their own schema, so we don't need to sniff or provide one.
+ super().__init__(input_file, sniff_rows=None, schema=None, keep_failed_cells=False)
+
  def read_chunks(self) -> Iterator[dict[str, JsonVal]]:
  import pyarrow.parquet as pq

@@ -258,6 +282,28 @@ class ParquetReader(TableReader):
  return value
  return value

+ @classmethod
+ def _read_sample_rows(cls, input_file: Path, sniff_rows: int) -> tuple[Sequence[str], list[dict[str, str]]]:
+ import pyarrow.parquet as pq
+
+ column_names: Sequence[str] = []
+ sample_rows: list[dict[str, str]] = []
+ with pq.ParquetFile(input_file) as parquet_file:
+ column_names = parquet_file.schema.names
+ row_count = min(sniff_rows, parquet_file.metadata.num_rows)
+ row_iter = parquet_file.iter_batches(batch_size=row_count)
+ try:
+ batch = next(row_iter)
+ for row in batch.to_pylist():
+ str_row = {key: (str(value) if value is not None else "") for key, value in row.items()}
+ sample_rows.append(str_row)
+ except StopIteration:
+ pass
+
+ if not sample_rows:
+ raise ToolkitValueError(f"No data found in the file: {input_file.as_posix()!r}.")
+ return column_names, sample_rows
+

  FILE_READ_CLS_BY_FORMAT: Mapping[str, type[FileReader]] = {}
  TABLE_READ_CLS_BY_FORMAT: Mapping[str, type[TableReader]] = {}
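Taken together, the reader refactor splits sniff_schema into three steps: a format-specific _read_sample_rows, a duplicate-column check, and type inference over the sampled values. The sketch below shows the inference step in isolation, with a simplified infer_type standing in for the Toolkit's infer_data_type_from_value and column tuples standing in for SchemaColumn objects; the sample rows are hypothetical:

from collections import Counter

def infer_type(value: str) -> str:
    # Simplified stand-in for the Toolkit's infer_data_type_from_value.
    if value.lstrip("-").isdigit():
        return "integer"
    try:
        float(value)
        return "float"
    except ValueError:
        return "string"

def infer_schema(sample_rows: list[dict[str, str]], column_names: list[str]) -> list[tuple[str, str]]:
    # For each column, pick the most common inferred type among non-empty samples,
    # defaulting to "string" when no usable samples exist (mirrors _infer_schema above).
    schema: list[tuple[str, str]] = []
    for name in column_names:
        samples = [row[name] for row in sample_rows if row.get(name)]
        counts = Counter(infer_type(value) for value in samples)
        schema.append((name, counts.most_common(1)[0][0] if counts else "string"))
    return schema

rows = [{"id": "1", "name": "pump", "weight": "2.5"}, {"id": "2", "name": "valve", "weight": "1.75"}]
print(infer_schema(rows, ["id", "name", "weight"]))
# [('id', 'integer'), ('name', 'string'), ('weight', 'float')]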