palimpzest 0.8.1__tar.gz → 0.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {palimpzest-0.8.1/src/palimpzest.egg-info → palimpzest-0.8.3}/PKG-INFO +3 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/pyproject.toml +3 -9
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/constants.py +38 -62
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/dataset.py +1 -1
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/iter_dataset.py +5 -5
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/groupbysig.py +1 -1
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/records.py +91 -109
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/lib/schemas.py +23 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/models.py +3 -3
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/__init__.py +2 -6
- palimpzest-0.8.3/src/palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest-0.8.3/src/palimpzest/prompts/critique_and_refine_prompts.py +66 -0
- palimpzest-0.8.3/src/palimpzest/prompts/filter_prompts.py +76 -0
- palimpzest-0.8.3/src/palimpzest/prompts/join_prompts.py +100 -0
- palimpzest-0.8.1/src/palimpzest/prompts/moa_aggregator_convert_prompts.py → palimpzest-0.8.3/src/palimpzest/prompts/moa_aggregator_prompts.py +51 -2
- palimpzest-0.8.3/src/palimpzest/prompts/moa_proposer_prompts.py +87 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/prompt_factory.py +351 -479
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/split_merge_prompts.py +51 -2
- palimpzest-0.8.3/src/palimpzest/prompts/split_proposer_prompts.py +87 -0
- palimpzest-0.8.3/src/palimpzest/prompts/utils.py +109 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/execution_strategy.py +4 -4
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/mab_execution_strategy.py +47 -23
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/parallel_execution_strategy.py +3 -3
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/generators/generators.py +31 -17
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/__init__.py +15 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/aggregate.py +21 -19
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/compute.py +6 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/convert.py +12 -37
- palimpzest-0.8.3/src/palimpzest/query/operators/critique_and_refine.py +194 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/distinct.py +7 -7
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/filter.py +13 -25
- palimpzest-0.8.3/src/palimpzest/query/operators/join.py +532 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/limit.py +4 -4
- palimpzest-0.8.3/src/palimpzest/query/operators/mixture_of_agents.py +246 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/physical.py +25 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/project.py +4 -4
- palimpzest-0.8.1/src/palimpzest/query/operators/rag_convert.py → palimpzest-0.8.3/src/palimpzest/query/operators/rag.py +202 -5
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/retrieve.py +10 -9
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/scan.py +9 -10
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/search.py +18 -24
- palimpzest-0.8.3/src/palimpzest/query/operators/split.py +321 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/__init__.py +12 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/optimizer.py +12 -10
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/rules.py +201 -108
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/tasks.py +18 -6
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/config.py +2 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/query_processor.py +2 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/query_processor_factory.py +9 -5
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/validator/validator.py +7 -9
- {palimpzest-0.8.1 → palimpzest-0.8.3/src/palimpzest.egg-info}/PKG-INFO +3 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/SOURCES.txt +8 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/requires.txt +1 -7
- palimpzest-0.8.1/src/palimpzest/prompts/convert_prompts.py +0 -143
- palimpzest-0.8.1/src/palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
- palimpzest-0.8.1/src/palimpzest/prompts/filter_prompts.py +0 -114
- palimpzest-0.8.1/src/palimpzest/prompts/join_prompts.py +0 -163
- palimpzest-0.8.1/src/palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
- palimpzest-0.8.1/src/palimpzest/prompts/split_proposer_prompts.py +0 -55
- palimpzest-0.8.1/src/palimpzest/prompts/util_phrases.py +0 -19
- palimpzest-0.8.1/src/palimpzest/query/operators/critique_and_refine_convert.py +0 -113
- palimpzest-0.8.1/src/palimpzest/query/operators/join.py +0 -403
- palimpzest-0.8.1/src/palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
- palimpzest-0.8.1/src/palimpzest/query/operators/split_convert.py +0 -170
- {palimpzest-0.8.1 → palimpzest-0.8.3}/LICENSE +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/README.md +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/setup.cfg +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/agents/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/agents/compute_agents.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/agents/search_agents.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/context.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/context_manager.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/index_dataset.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/filters.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/lib/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/policy.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/agent_prompts.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/context_search.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/validator.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/generators/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/logical.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/cost_model.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/plan.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/primitives.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/schemabuilder/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/README.md +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/allenpdf.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/pdfparser.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/skema_tools.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/env_helpers.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/hash_helpers.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/model_helpers.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/progress.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/udfs.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/validator/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/dependency_links.txt +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/top_level.txt +0 -0
(An ellipsis `…` marks a deleted line whose content is truncated in the published diff.)

```diff
--- palimpzest-0.8.1/src/palimpzest.egg-info/PKG-INFO
+++ palimpzest-0.8.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.8.1
+Version: 0.8.3
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
-Requires-Python: >=3.…
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anthropic>=0.55.0
@@ -22,7 +22,7 @@ Requires-Dist: colorama>=0.4.6
 Requires-Dist: datasets>=4.0.0
 Requires-Dist: fastapi~=0.115.0
 Requires-Dist: gradio>=5.26.0
-Requires-Dist: litellm>=1.…
+Requires-Dist: litellm>=1.76.1
 Requires-Dist: numpy==2.0.2
 Requires-Dist: openai>=1.0
 Requires-Dist: pandas>=2.1.1
@@ -44,11 +44,6 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: together>=1.5.5
 Requires-Dist: tqdm~=4.66.1
 Requires-Dist: rich[jupyter]>=13.9.2
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.6.1; extra == "docs"
-Requires-Dist: mkdocs-material>=9.6.3; extra == "docs"
-Requires-Dist: mkdocstrings-python>=1.15.0; extra == "docs"
-Requires-Dist: mkdocs-material[imaging]; extra == "docs"
 Provides-Extra: vllm
 Requires-Dist: vllm>=0.10.1.1; extra == "vllm"
 Dynamic: license-file
```
```diff
--- palimpzest-0.8.1/pyproject.toml
+++ palimpzest-0.8.3/pyproject.toml
@@ -1,9 +1,9 @@
 [project]
 name = "palimpzest"
-version = "0.8.1"
+version = "0.8.3"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
-requires-python = ">=3.…
+requires-python = ">=3.10"
 keywords = ["relational", "optimization", "llm", "AI programming", "extraction", "tools", "document", "search", "integration"]
 authors = [
     {name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
@@ -16,7 +16,7 @@ dependencies = [
     "datasets>=4.0.0",
     "fastapi~=0.115.0",
     "gradio>=5.26.0",
-    "litellm>=1.…
+    "litellm>=1.76.1",
     "numpy==2.0.2",
     "openai>=1.0",
     "pandas>=2.1.1",
@@ -49,12 +49,6 @@ classifiers=[
 ]
 
 [project.optional-dependencies]
-docs = [
-    "mkdocs>=1.6.1",
-    "mkdocs-material>=9.6.3",
-    "mkdocstrings-python>=1.15.0",
-    "mkdocs-material[imaging]",
-]
 vllm = [
     "vllm>=0.10.1.1",
 ]
```
```diff
--- palimpzest-0.8.1/src/palimpzest/constants.py
+++ palimpzest-0.8.3/src/palimpzest/constants.py
@@ -25,8 +25,6 @@ class Model(str, Enum):
     GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
     GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
     o4_MINI = "openai/o4-mini-2025-04-16"  # noqa: N815
-    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
-    CLIP_VIT_B_32 = "clip-ViT-B-32"
     CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
     CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
     CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
@@ -41,6 +39,8 @@ class Model(str, Enum):
     GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
     VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
     # o1 = "o1-2024-12-17"
+    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
+    CLIP_VIT_B_32 = "clip-ViT-B-32"
 
     def __repr__(self):
         return f"{self.name}"
@@ -136,69 +136,38 @@ class PromptStrategy(str, Enum):
     performing some task with a specified Model.
     """
 
-    # … [26 deleted lines (old 139-164) are truncated in the published diff]
-    COT_QA_REFINE = "chain-of-thought-question-refine"
-
-    # Chain-of-Thought Question with Image Prompt Strategies
-    COT_QA_IMAGE = "chain-of-thought-question-image"
-    COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
-    COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
-    COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
-
-    # Chain-of-Thought Queestion with Audio Prompt Strategies
-    COT_QA_AUDIO = "chain-of-thought-question-audio"
-    COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
-    # TODO: COT_QA_AUDIO_CRITIC/REFINE
-
-    # Mixture-of-Agents Prompt Strategies
-    COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
-    COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
-    COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
-    # TODO: COT_MOA_PROPOSER_AUDIO
-
-    # Split Convert Prompt Strategies
-    SPLIT_PROPOSER = "split-proposer"
-    SPLIT_MERGER = "split-merger"
-
-    def is_image_prompt(self):
-        return "image" in self.value
-
-    def is_audio_prompt(self):
-        return "audio" in self.value
-
-    def is_bool_prompt(self):
-        return "bool" in self.value
+    # filter prompt strategies
+    FILTER = "filter"
+    FILTER_NO_REASONING = "filter-no-reasoning"
+    FILTER_CRITIC = "filter-critic"
+    FILTER_REFINE = "filter-refine"
+    FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
+    FILTER_MOA_AGG = "filter-mixture-of-agents-aggregation"
+    FILTER_SPLIT_PROPOSER = "filter-split-proposer"
+    FILTER_SPLIT_MERGER = "filter-split-merger"
+
+    # join prompt strategies
+    JOIN = "join"
+    JOIN_NO_REASONING = "join-no-reasoning"
+
+    # map prompt strategies
+    MAP = "map"
+    MAP_NO_REASONING = "map-no-reasoning"
+    MAP_CRITIC = "map-critic"
+    MAP_REFINE = "map-refine"
+    MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
+    MAP_MOA_AGG = "map-mixture-of-agents-aggregation"
+    MAP_SPLIT_PROPOSER = "map-split-proposer"
+    MAP_SPLIT_MERGER = "map-split-merger"
+
+    def is_filter_prompt(self):
+        return "filter" in self.value
 
     def is_join_prompt(self):
         return "join" in self.value
 
-    def …
-    return "…
+    def is_map_prompt(self):
+        return "map" in self.value
 
     def is_critic_prompt(self):
         return "critic" in self.value
@@ -221,6 +190,13 @@ class PromptStrategy(str, Enum):
     def is_no_reasoning_prompt(self):
        return "no-reasoning" in self.value
 
+
+class Modality(str, Enum):
+    TEXT = "text"
+    IMAGE = "image"
+    AUDIO = "audio"
+
+
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
@@ -527,7 +503,7 @@ CLIP_VIT_B_32_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0098,  # NOTE: just copying TEXT_EMBEDDING_3_SMALL_MODEL_CARD for now
     ##### Agg. Benchmark #####
-    "overall": 63.3,  # NOTE: …
+    "overall": 63.3,  # NOTE: imageNet top-1 accuracy
 }
 CLAUDE_3_5_SONNET_MODEL_CARD = {
     ##### Cost in USD #####
```
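Since the 0.8.3 `PromptStrategy` values are plain substrings, the `is_*_prompt` helpers are simple membership checks, and the new `Modality` enum takes over the text/image/audio distinction that the removed per-strategy variants (e.g. `COT_QA_IMAGE`) used to encode. A minimal sketch of how they behave, assuming both enums import from `palimpzest.constants` as the file path above suggests:

```python
# Illustrative only; the enum members and helper methods are taken from the diff above.
from palimpzest.constants import Modality, PromptStrategy

strategy = PromptStrategy.MAP_CRITIC        # value: "map-critic"
assert strategy.is_map_prompt()             # "map" is a substring of "map-critic"
assert strategy.is_critic_prompt()          # "critic" is a substring of "map-critic"
assert not strategy.is_filter_prompt()      # no "filter" substring

# Modality replaces the old per-strategy image/audio variants.
print([m.value for m in Modality])          # ['text', 'image', 'audio']
```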
```diff
--- palimpzest-0.8.1/src/palimpzest/core/data/dataset.py
+++ palimpzest-0.8.3/src/palimpzest/core/data/dataset.py
@@ -595,7 +595,7 @@ class Dataset:
 
         return QueryProcessorFactory.create_and_run_processor(self, config)
 
-    def optimize_and_run(self, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, …
+    def optimize_and_run(self, config: QueryProcessorConfig | None = None, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, **kwargs):
         """Optimize the PZ program using the train_dataset and validator before running the optimized plan."""
         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
         from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
```
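In 0.8.3, `optimize_and_run` takes an optional `QueryProcessorConfig` as its first parameter and accepts `**kwargs`, so 0.8.1 call sites that passed `train_dataset` positionally need updating. A hedged sketch of the new call shape; the variable names, the no-argument config construction, and the import path (inferred from `src/palimpzest/query/processor/config.py` in the file list) are assumptions, not from the package:

```python
# Hypothetical call site; `ds`, `train_ds`, and `my_validator` are placeholders.
from palimpzest.query.processor.config import QueryProcessorConfig  # assumed import path

config = QueryProcessorConfig()  # assumed to be constructible with defaults

# 0.8.1 signature began with train_dataset; 0.8.3 puts the config first.
# Keyword arguments keep the call unambiguous across both versions.
output = ds.optimize_and_run(config, train_dataset=train_ds, validator=my_validator)
```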
```diff
--- palimpzest-0.8.1/src/palimpzest/core/data/iter_dataset.py
+++ palimpzest-0.8.3/src/palimpzest/core/data/iter_dataset.py
@@ -227,7 +227,7 @@ class HTMLFileDataset(BaseFileDataset):
             path (str): The path to the directory
         """
         super().__init__(path=path, id=id, schema=WebPage)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.HTML_EXTENSIONS))]
 
     def _html_to_text_with_links(self, html: str) -> str:
         # Parse the HTML content
@@ -295,7 +295,7 @@ class ImageFileDataset(BaseFileDataset):
             path (str): The path to the directory
         """
         super().__init__(path=path, id=id, schema=ImageFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.IMAGE_EXTENSIONS))]
 
     def __getitem__(self, idx: int) -> dict:
         """
@@ -347,7 +347,7 @@ class PDFFileDataset(BaseFileDataset):
             file_cache_dir (str): The directory to store the temporary files generated during PDF processing
         """
         super().__init__(path=path, id=id, schema=PDFFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.PDF_EXTENSIONS))]
         self.pdfprocessor = pdfprocessor
         self.file_cache_dir = file_cache_dir
 
@@ -432,7 +432,7 @@ class XLSFileDataset(BaseFileDataset):
         Constructor for the `XLSFileDataset` class. The `schema` is set to the `XLSFile` schema.
         """
         super().__init__(path=path, id=id, schema=XLSFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.XLS_EXTENSIONS))]
 
     def __getitem__(self, idx: int) -> dict:
         """
@@ -483,7 +483,7 @@ class AudioFileDataset(BaseFileDirectoryDataset):
             path (str): The path to the directory
         """
         super().__init__(path=path, id=id, schema=AudioFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.AUDIO_EXTENSIONS))]
 
     def __getitem__(self, idx: int) -> dict:
         """
```
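Each constructor now keeps only the files whose extension matches its schema; `str.endswith` accepts a tuple of suffixes, which is what makes the one-liner work. A self-contained sketch with made-up extension values (the real lists live in `palimpzest.constants`):

```python
# Illustrative stand-in for constants.HTML_EXTENSIONS; the real values may differ.
HTML_EXTENSIONS = [".html", ".htm"]

filepaths = ["index.html", "notes.txt", "about.htm"]
# Same filtering pattern as the new constructor lines above.
filepaths = [fp for fp in filepaths if fp.endswith(tuple(HTML_EXTENSIONS))]
print(filepaths)  # ['index.html', 'about.htm']
```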
```diff
--- palimpzest-0.8.1/src/palimpzest/core/elements/groupbysig.py
+++ palimpzest-0.8.3/src/palimpzest/core/elements/groupbysig.py
@@ -16,7 +16,7 @@ class GroupBySig:
         self.agg_funcs = agg_funcs
         self.agg_fields = agg_fields
 
-    def validate_schema(self, input_schema: BaseModel) -> tuple[bool, str | None]:
+    def validate_schema(self, input_schema: type[BaseModel]) -> tuple[bool, str | None]:
         for f in self.group_by_fields:
             if f not in input_schema.model_fields:
                 return (False, "Supplied schema has no field " + f)
```
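The annotation change is a correctness fix: `validate_schema` receives the pydantic schema class itself (it reads the class-level `model_fields`), so `type[BaseModel]` is the accurate hint. A small sketch of the distinction, with an example schema that is not from the package:

```python
from pydantic import BaseModel

class Person(BaseModel):  # example schema, not from the package
    name: str
    age: int

def validate_schema(input_schema: type[BaseModel]) -> bool:
    # model_fields lives on the class, so the function takes the class, not an instance
    return "name" in input_schema.model_fields

assert validate_schema(Person)  # pass the class itself
# The old hint `BaseModel` wrongly suggested passing Person(name="a", age=1).
```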
```diff
--- palimpzest-0.8.1/src/palimpzest/core/elements/records.py
+++ palimpzest-0.8.3/src/palimpzest/core/elements/records.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import json
 from collections.abc import Generator
+from copy import deepcopy
 from typing import Any
 
 import pandas as pd
@@ -28,8 +29,8 @@ class DataRecord:
 
     def __init__(
         self,
-        …
-        source_indices: str | list[str],
+        data_item: BaseModel,
+        source_indices: str | int | list[str | int],
         parent_ids: str | list[str] | None = None,
         cardinality_idx: int | None = None,
     ):
@@ -44,27 +45,21 @@ class DataRecord:
         if isinstance(parent_ids, str):
             parent_ids = [parent_ids]
 
-        # …
-        self.…
-
-        # mapping from field names to Field objects; effectively a mapping from a field name to its type
-        self.field_types: dict[str, FieldInfo] = schema.model_fields
-
-        # mapping from field names to their values
-        self.field_values: dict[str, Any] = {}
+        # data for the data record
+        self._data_item = data_item
 
         # the index in the root Dataset from which this DataRecord is derived;
         # each source index takes the form: f"{root_dataset.id}-{idx}"
-        self.…
+        self._source_indices = sorted(source_indices)
 
         # the id(s) of the parent record(s) from which this DataRecord is derived
-        self.…
+        self._parent_ids = parent_ids
 
         # store the cardinality index
-        self.…
+        self._cardinality_idx = cardinality_idx
 
         # indicator variable which may be flipped by filter operations to signal when a record has been filtered out
-        self.…
+        self._passed_operator = True
 
         # NOTE: Record ids are hashed based on:
         # 0. their schema (keys)
@@ -78,106 +73,98 @@ class DataRecord:
         # We may revisit this hashing scheme in the future.
 
         # unique identifier for the record
+        schema_fields = sorted(list(type(data_item).model_fields))
         id_str = (
-            str(…
+            str(schema_fields) + str(parent_ids) if parent_ids is not None else str(self._source_indices)
             if cardinality_idx is None
-            else str(…
+            else str(schema_fields) + str(cardinality_idx) + str(parent_ids) if parent_ids is not None else str(self._source_indices)
         )
-
-        # the options: built_in_id, generated_id
-        self.id = hash_for_id(id_str)
+        self._id = hash_for_id(id_str)
 
+    # TODO: raise an exception if one of these fields is present in the schema
+    # - put these in a constant list up top
+    # - import the constant list in Dataset (if possible) and check at plan creation time
     def __setattr__(self, name: str, value: Any, /) -> None:
-        if name in ["…
+        if name in ["_data_item", "_source_indices", "_parent_ids", "_cardinality_idx", "_passed_operator", "_id"]:
             super().__setattr__(name, value)
         else:
-            self.…
+            setattr(self._data_item, name, value)
 
     def __getattr__(self, name: str) -> Any:
-        … [4 deleted lines (old 99-102) are truncated in the published diff]
-        else:
-            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
-
+        field = getattr(self._data_item, name, None)
+        if field is not None:
+            return field
+        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
 
     def __getitem__(self, field: str) -> Any:
-        return self.…
+        return getattr(self._data_item, field)
 
     def __setitem__(self, field: str, value: Any) -> None:
-        self.…
+        setattr(self._data_item, field, value)
 
     def __str__(self, truncate: int | None = 15) -> str:
         if truncate is not None:
-            items = (f"{k}={str(v)[:truncate]!r}{'...' if len(str(v)) > truncate else ''}" for k, v in sorted(self.…
+            items = (f"{k}={str(v)[:truncate]!r}{'...' if len(str(v)) > truncate else ''}" for k, v in sorted(self._data_item.model_dump().items()))
         else:
-            items = (f"{k}={v!r}" for k, v in sorted(self.…
+            items = (f"{k}={v!r}" for k, v in sorted(self._data_item.model_dump().items()))
         return "{}({})".format(type(self).__name__, ", ".join(items))
 
+
     def __repr__(self) -> str:
         return self.__str__(truncate=None)
 
+
     def __eq__(self, other):
-        return isinstance(other, DataRecord) and self.…
+        return isinstance(other, DataRecord) and self._data_item == other._data_item
+
 
     def __hash__(self):
         return hash(self.to_json_str(bytes_to_str=True, sorted=True))
 
     def __iter__(self):
-        yield from self.…
+        yield from self._data_item.__iter__()
 
     def get_field_names(self):
-        return list(self.…
+        return list(type(self._data_item).model_fields.keys())
 
     def get_field_type(self, field_name: str) -> FieldInfo:
-        return self.…
+        return type(self._data_item).model_fields[field_name]
+
+    @property
+    def schema(self) -> type[BaseModel]:
+        return type(self._data_item)
 
+    def copy(self):
+        # get the set of fields to copy from the parent record
+        copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
+
+        # copy field types and values from the parent
+        data_item = {field_name: self[field_name] for field_name in copy_field_names}
 
-    def copy(self, include_bytes: bool = True, project_cols: list[str] | None = None):
         # make copy of the current record
         new_dr = DataRecord(
-            self.schema,
-            source_indices=self.…
-            parent_ids=self.…
-            cardinality_idx=self.…
+            self.schema(**data_item),
+            source_indices=self._source_indices,
+            parent_ids=self._parent_ids,
+            cardinality_idx=self._cardinality_idx,
         )
 
         # copy the passed_operator attribute
-        new_dr.…
-
-        # get the set of fields to copy from the parent record
-        copy_field_names = project_cols if project_cols is not None else self.get_field_names()
-        copy_field_names = [field.split(".")[-1] for field in copy_field_names]
-
-        # copy field types and values from the parent
-        for field_name in copy_field_names:
-            field_type = self.get_field_type(field_name)
-            field_value = self[field_name]
-            if (
-                not include_bytes
-                and isinstance(field_value, bytes)
-                or (isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], bytes))
-            ):
-                continue
-
-            # set field and value
-            new_dr.field_types[field_name] = field_type
-            new_dr[field_name] = field_value
+        new_dr._passed_operator = self._passed_operator
 
         return new_dr
 
-
     @staticmethod
     def from_parent(
-        schema: BaseModel,
+        schema: type[BaseModel],
+        data_item: dict,
         parent_record: DataRecord,
         project_cols: list[str] | None = None,
         cardinality_idx: int | None = None,
@@ -194,29 +181,33 @@ class DataRecord:
         new_schema = union_schemas([schema, parent_record.schema])
         new_schema = project(new_schema, project_cols)
 
-        # make new record which has parent_record as its parent (and the same source_indices)
-        new_dr = DataRecord(
-            new_schema,
-            source_indices=parent_record.source_indices,
-            parent_ids=[parent_record.id],
-            cardinality_idx=cardinality_idx,
-        )
-
         # get the set of fields and field descriptions to copy from the parent record
-        copy_field_names = …
+        copy_field_names = parent_record.get_field_names() if project_cols is None else project_cols
         copy_field_names = [field.split(".")[-1] for field in copy_field_names]
 
         # copy fields from the parent
-        for field_name in copy_field_names…
-            new_dr.field_types[field_name] = parent_record.get_field_type(field_name)
-            new_dr[field_name] = parent_record[field_name]
+        data_item.update({field_name: parent_record[field_name] for field_name in copy_field_names})
 
-
+        # corner-case: wrap values in lists if the new schema expects a list but the data item has a single value
+        for field_name, field_info in new_schema.model_fields.items():
+            field_should_be_list = hasattr(field_info.annotation, '__origin__') and field_info.annotation.__origin__ is list
+            field_is_not_list = field_name in data_item and not isinstance(data_item[field_name], list)
+            if field_should_be_list and field_is_not_list:
+                data_item[field_name] = [data_item[field_name]]
+
+        # make new record which has parent_record as its parent (and the same source_indices)
+        new_dr = DataRecord(
+            new_schema(**data_item),
+            source_indices=parent_record._source_indices,
+            parent_ids=[parent_record._id],
+            cardinality_idx=cardinality_idx,
+        )
 
+        return new_dr
 
     @staticmethod
     def from_agg_parents(
-        …
+        data_item: BaseModel,
         parent_records: DataRecordSet,
         cardinality_idx: int | None = None,
     ) -> DataRecord:
@@ -224,33 +215,25 @@ class DataRecord:
         source_indices = [
             source_idx
             for parent_record in parent_records
-            for source_idx in parent_record.…
+            for source_idx in parent_record._source_indices
         ]
 
         # make new record which has all parent records as its parents
         return DataRecord(
-            …
+            data_item,
             source_indices=source_indices,
-            parent_ids=[parent_record.…
+            parent_ids=[parent_record._id for parent_record in parent_records],
             cardinality_idx=cardinality_idx,
         )
 
     @staticmethod
     def from_join_parents(
-        schema: BaseModel,
+        schema: type[BaseModel],
         left_parent_record: DataRecord,
         right_parent_record: DataRecord,
         project_cols: list[str] | None = None,
         cardinality_idx: int = None,
     ) -> DataRecord:
-        # make new record which has left and right parent record as its parents
-        new_dr = DataRecord(
-            schema,
-            source_indices=list(left_parent_record.source_indices) + list(right_parent_record.source_indices),
-            parent_ids=[left_parent_record.id, right_parent_record.id],
-            cardinality_idx=cardinality_idx,
-        )
-
         # get the set of fields and field descriptions to copy from the parent record(s)
         left_copy_field_names = (
             left_parent_record.get_field_names()
@@ -266,23 +249,26 @@ class DataRecord:
         right_copy_field_names = [field.split(".")[-1] for field in right_copy_field_names]
 
         # copy fields from the parents
-        for field_name in left_copy_field_names…
-            new_dr.field_types[field_name] = left_parent_record.get_field_type(field_name)
-            new_dr[field_name] = left_parent_record[field_name]
-
+        data_item = {field_name: left_parent_record[field_name] for field_name in left_copy_field_names}
         for field_name in right_copy_field_names:
             new_field_name = field_name
             if field_name in left_copy_field_names:
                 new_field_name = f"{field_name}_right"
-
-            new_dr[new_field_name] = right_parent_record[field_name]
+            data_item[new_field_name] = right_parent_record[field_name]
 
-
+        # make new record which has left and right parent record as its parents
+        new_dr = DataRecord(
+            schema(**data_item),
+            source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
+            parent_ids=[left_parent_record._id, right_parent_record._id],
+            cardinality_idx=cardinality_idx,
+        )
 
+        return new_dr
 
     # TODO: unused outside of unit tests
     @staticmethod
-    def from_df(df: pd.DataFrame, schema: BaseModel | None = None) -> list[DataRecord]:
+    def from_df(df: pd.DataFrame, schema: type[BaseModel] | None = None) -> list[DataRecord]:
         """Create a list of DataRecords from a pandas DataFrame
 
         Args:
@@ -309,9 +295,7 @@ class DataRecord:
         records = []
         for idx, row in df.iterrows():
             row_dict = row.to_dict()
-            record = DataRecord(schema…
-            record.field_values = row_dict
-            record.field_types = {field_name: schema.model_fields[field_name] for field_name in row_dict}
+            record = DataRecord(schema(**row_dict), source_indices=[f"{dataset_id}-{idx}"])
             records.append(record)
 
         return records
@@ -346,9 +330,8 @@ class DataRecord:
         # TODO(chjun): In case of numpy types, the json.dumps will fail. Convert to native types.
         # Better ways to handle this.
         field_values = {
-            k: v.description…
-
-            for k, v in self.field_values.items()
+            k: v.description if isinstance(v, context.Context) else v
+            for k, v in self._data_item.model_dump().items()
         }
         dct = pd.Series(field_values).to_dict()
 
@@ -358,7 +341,7 @@ class DataRecord:
 
         if not include_bytes:
             for k in dct:
-                field_type = self.…
+                field_type = self.get_field_type(k)
                 if field_type.annotation in [bytes, AudioBase64, ImageBase64, list[bytes], list[ImageBase64]]:
                     dct[k] = "<bytes>"
 
@@ -374,11 +357,11 @@ class DataRecord:
 
         if mask_filepaths:
             for k in dct:
-                field_type = self.…
+                field_type = self.get_field_type(k)
                 if field_type.annotation in [AudioBase64, AudioFilepath, ImageBase64, ImageFilepath, ImageURL]:
                     dct[k] = "<bytes>"
 
-        return dct
+        return deepcopy(dct)
 
 
 class DataRecordSet:
@@ -399,8 +382,8 @@ class DataRecordSet:
         # set data_records, parent_ids, and source_indices; note that it is possible for
         # data_records to be an empty list in the event of a failed convert
         self.data_records = data_records
-        self.parent_ids = data_records[0].…
-        self.source_indices = data_records[0].…
+        self.parent_ids = data_records[0]._parent_ids if len(data_records) > 0 else None
+        self.source_indices = data_records[0]._source_indices if len(data_records) > 0 else None
        self.schema = data_records[0].schema if len(data_records) > 0 else None
 
         # the input to the operator which produced the data_records; type is tuple[DataRecord] | tuple[int]
@@ -448,7 +431,6 @@ class DataRecordCollection:
     DataRecordSet is used for the output of executing an operator.
     DataRecordCollection is used for the output of executing a query, we definitely could extend it to support more advanced features for output of execute().
     """
-    # TODO(Jun): consider to have stats_manager class to centralize stats management.
     def __init__(self, data_records: list[DataRecord], execution_stats: ExecutionStats | None = None, plan_stats: PlanStats | None = None):
         self.data_records = data_records
         self.execution_stats = execution_stats
```