palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +343 -209
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +639 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +62 -6
- palimpzest/prompts/filter_prompts.py +51 -6
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
- palimpzest/prompts/prompt_factory.py +375 -47
- palimpzest/prompts/split_proposer_prompts.py +1 -1
- palimpzest/prompts/util_phrases.py +5 -0
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +160 -331
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +33 -19
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +26 -16
- palimpzest/query/operators/join.py +403 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +205 -77
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +42 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +32 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
- palimpzest-0.8.1.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.21.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
```diff
--- palimpzest-0.7.21/palimpzest/core/elements/records.py
+++ palimpzest-0.8.1/palimpzest/core/elements/records.py
@@ -5,40 +5,60 @@ from collections.abc import Generator
 from typing import Any
 
 import pandas as pd
-
-from
-
-from palimpzest.core.
-from palimpzest.
+from pydantic import BaseModel
+from pydantic.fields import FieldInfo
+
+from palimpzest.core.data import context
+from palimpzest.core.lib.schemas import (
+    AudioBase64,
+    AudioFilepath,
+    ImageBase64,
+    ImageFilepath,
+    ImageURL,
+    create_schema_from_df,
+    project,
+    union_schemas,
+)
+from palimpzest.core.models import ExecutionStats, PlanStats, RecordOpStats
+from palimpzest.utils.hash_helpers import hash_for_id, hash_for_serialized_dict
 
 
 class DataRecord:
-    """A DataRecord is a single record of data matching some
+    """A DataRecord is a single record of data matching some schema defined by a BaseModel."""
 
     def __init__(
         self,
-        schema:
-
-
+        schema: BaseModel,
+        source_indices: str | list[str],
+        parent_ids: str | list[str] | None = None,
         cardinality_idx: int | None = None,
     ):
-        # check that
-        assert
+        # check that source_indices are provided
+        assert source_indices is not None, "Every DataRecord must be constructed with source index (or indices)"
+
+        # normalize to list[str]
+        if not isinstance(source_indices, list):
+            source_indices = [source_indices]
+
+        # normalize to list[str]
+        if isinstance(parent_ids, str):
+            parent_ids = [parent_ids]
 
         # schema for the data record
         self.schema = schema
 
         # mapping from field names to Field objects; effectively a mapping from a field name to its type
-        self.field_types: dict[str,
+        self.field_types: dict[str, FieldInfo] = schema.model_fields
 
         # mapping from field names to their values
         self.field_values: dict[str, Any] = {}
 
-        # the index in the
-
+        # the index in the root Dataset from which this DataRecord is derived;
+        # each source index takes the form: f"{root_dataset.id}-{idx}"
+        self.source_indices = sorted(source_indices)
 
-        # the id of the parent record(s) from which this DataRecord is derived
-        self.
+        # the id(s) of the parent record(s) from which this DataRecord is derived
+        self.parent_ids = parent_ids
 
         # store the cardinality index
         self.cardinality_idx = cardinality_idx
@@ -48,7 +68,7 @@ class DataRecord:
 
         # NOTE: Record ids are hashed based on:
         # 0. their schema (keys)
-        # 1. their parent record id(s) (or
+        # 1. their parent record id(s) (or source_indices if there is no parent record)
         # 2. their index in the fan out (if this is in a one-to-many operation)
         #
         # We currently do NOT hash just based on record content (i.e. schema (key, value) pairs)
@@ -59,9 +79,9 @@ class DataRecord:
 
         # unique identifier for the record
         id_str = (
-            str(schema) + (
+            str(schema) + str(parent_ids) if parent_ids is not None else str(self.source_indices)
             if cardinality_idx is None
-            else str(schema) + str(cardinality_idx) + str(
+            else str(schema) + str(cardinality_idx) + str(parent_ids) if parent_ids is not None else str(self.source_indices)
         )
         # TODO(Jun): build-in id should has a special name, the current self.id is too general which would conflict with user defined schema too easily.
         # the options: built_in_id, generated_id
@@ -69,7 +89,7 @@ class DataRecord:
 
 
     def __setattr__(self, name: str, value: Any, /) -> None:
-        if name in ["schema", "field_types", "field_values", "
+        if name in ["schema", "field_types", "field_values", "source_indices", "parent_ids", "cardinality_idx", "passed_operator", "id"]:
             super().__setattr__(name, value)
         else:
             self.field_values[name] = value
@@ -103,11 +123,10 @@ class DataRecord:
         return self.__str__(truncate=None)
 
     def __eq__(self, other):
-        return isinstance(other, DataRecord) and self.field_values == other.field_values and self.schema
-
+        return isinstance(other, DataRecord) and self.field_values == other.field_values and self.schema == other.schema
 
     def __hash__(self):
-        return hash(self.to_json_str(bytes_to_str=True))
+        return hash(self.to_json_str(bytes_to_str=True, sorted=True))
 
 
     def __iter__(self):
@@ -118,16 +137,16 @@ class DataRecord:
         return list(self.field_values.keys())
 
 
-    def get_field_type(self, field_name: str) ->
+    def get_field_type(self, field_name: str) -> FieldInfo:
         return self.field_types[field_name]
 
 
     def copy(self, include_bytes: bool = True, project_cols: list[str] | None = None):
-        # make
+        # make copy of the current record
         new_dr = DataRecord(
             self.schema,
-
-
+            source_indices=self.source_indices,
+            parent_ids=self.parent_ids,
             cardinality_idx=self.cardinality_idx,
         )
 
@@ -158,25 +177,28 @@ class DataRecord:
 
     @staticmethod
     def from_parent(
-        schema:
+        schema: BaseModel,
         parent_record: DataRecord,
         project_cols: list[str] | None = None,
         cardinality_idx: int | None = None,
     ) -> DataRecord:
-        # project_cols must be None or contain at least one column
-        assert project_cols is None or len(project_cols) >= 1, "must have at least one column if using projection"
-
         # if project_cols is None, then the new schema is a union of the provided schema and parent_record.schema;
+        # if project_cols is an empty list, then the new schema is simply the provided schema
         # otherwise, it's a ProjectSchema
-        new_schema =
-        if project_cols is
-            new_schema =
+        new_schema = None
+        if project_cols is None:
+            new_schema = union_schemas([schema, parent_record.schema])
+        elif project_cols == []:
+            new_schema = schema
+        else:
+            new_schema = union_schemas([schema, parent_record.schema])
+            new_schema = project(new_schema, project_cols)
 
-        # make new record which has parent_record as its parent (and the same
+        # make new record which has parent_record as its parent (and the same source_indices)
         new_dr = DataRecord(
             new_schema,
-
-
+            source_indices=parent_record.source_indices,
+            parent_ids=[parent_record.id],
             cardinality_idx=cardinality_idx,
         )
 
@@ -194,35 +216,78 @@ class DataRecord:
 
     @staticmethod
     def from_agg_parents(
-        schema:
+        schema: BaseModel,
         parent_records: DataRecordSet,
-        project_cols: list[str] | None = None,
         cardinality_idx: int | None = None,
     ) -> DataRecord:
-        #
-
-
+        # flatten source indices from all parents
+        source_indices = [
+            source_idx
+            for parent_record in parent_records
+            for source_idx in parent_record.source_indices
+        ]
+
+        # make new record which has all parent records as its parents
+        return DataRecord(
+            schema,
+            source_indices=source_indices,
+            parent_ids=[parent_record.id for parent_record in parent_records],
+            cardinality_idx=cardinality_idx,
+        )
 
     @staticmethod
     def from_join_parents(
-
-        right_schema: Schema,
+        schema: BaseModel,
         left_parent_record: DataRecord,
         right_parent_record: DataRecord,
         project_cols: list[str] | None = None,
         cardinality_idx: int = None,
     ) -> DataRecord:
-        #
-
+        # make new record which has left and right parent record as its parents
+        new_dr = DataRecord(
+            schema,
+            source_indices=list(left_parent_record.source_indices) + list(right_parent_record.source_indices),
+            parent_ids=[left_parent_record.id, right_parent_record.id],
+            cardinality_idx=cardinality_idx,
+        )
+
+        # get the set of fields and field descriptions to copy from the parent record(s)
+        left_copy_field_names = (
+            left_parent_record.get_field_names()
+            if project_cols is None
+            else [col for col in project_cols if col in left_parent_record.get_field_names()]
+        )
+        right_copy_field_names = (
+            right_parent_record.get_field_names()
+            if project_cols is None
+            else [col for col in project_cols if col in right_parent_record.get_field_names()]
+        )
+        left_copy_field_names = [field.split(".")[-1] for field in left_copy_field_names]
+        right_copy_field_names = [field.split(".")[-1] for field in right_copy_field_names]
 
+        # copy fields from the parents
+        for field_name in left_copy_field_names:
+            new_dr.field_types[field_name] = left_parent_record.get_field_type(field_name)
+            new_dr[field_name] = left_parent_record[field_name]
 
+        for field_name in right_copy_field_names:
+            new_field_name = field_name
+            if field_name in left_copy_field_names:
+                new_field_name = f"{field_name}_right"
+            new_dr.field_types[new_field_name] = right_parent_record.get_field_type(field_name)
+            new_dr[new_field_name] = right_parent_record[field_name]
+
+        return new_dr
+
+
+    # TODO: unused outside of unit tests
     @staticmethod
-    def from_df(df: pd.DataFrame, schema:
+    def from_df(df: pd.DataFrame, schema: BaseModel | None = None) -> list[DataRecord]:
         """Create a list of DataRecords from a pandas DataFrame
 
         Args:
             df (pd.DataFrame): Input DataFrame
-            schema (
+            schema (BaseModel, optional): Schema for the DataRecords. If None, will be derived from DataFrame
 
         Returns:
             list[DataRecord]: List of DataRecord instances
@@ -230,16 +295,23 @@ class DataRecord:
         if df is None:
             raise ValueError("DataFrame is None!")
 
-
+        # create schema if one isn't provided
         if schema is None:
-            schema =
+            schema = create_schema_from_df(df)
+
+        # create an id for the dataset from the schema
+        dataset_id = hash_for_serialized_dict({
+            k: {"annotation": str(v.annotation), "default": str(v.default), "description": v.description}
+            for k, v in schema.model_fields.items()
+        })
 
-
-
+        # create records
+        records = []
+        for idx, row in df.iterrows():
            row_dict = row.to_dict()
-            record = DataRecord(schema=schema,
+            record = DataRecord(schema=schema, source_indices=[f"{dataset_id}-{idx}"])
             record.field_values = row_dict
-            record.field_types = {field_name:
+            record.field_types = {field_name: schema.model_fields[field_name] for field_name in row_dict}
             records.append(record)
 
         return records
@@ -253,33 +325,41 @@ class DataRecord:
         if project_cols is not None and len(project_cols) > 0:
             fields = [field for field in fields if field in project_cols]
 
+        # convert Context --> str
+        for record in records:
+            for k in fields:
+                if isinstance(record[k], context.Context):
+                    record[k] = record[k].description
+
         return pd.DataFrame([
             {k: record[k] for k in fields}
             for record in records
         ])
 
-    def to_json_str(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None):
+    def to_json_str(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None, sorted: bool = False):
         """Return a JSON representation of this DataRecord"""
-        record_dict = self.to_dict(include_bytes, bytes_to_str, project_cols)
-        record_dict = {
-            field_name: self.schema.field_to_json(field_name, field_value)
-            for field_name, field_value in record_dict.items()
-        }
+        record_dict = self.to_dict(include_bytes, bytes_to_str, project_cols, sorted)
         return json.dumps(record_dict, indent=2)
 
-    def to_dict(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None):
+    def to_dict(self, include_bytes: bool = True, bytes_to_str: bool = False, project_cols: list[str] | None = None, _sorted: bool = False, mask_filepaths: bool = False):
         """Return a dictionary representation of this DataRecord"""
         # TODO(chjun): In case of numpy types, the json.dumps will fail. Convert to native types.
         # Better ways to handle this.
-
+        field_values = {
+            k: v.description
+            if isinstance(v, context.Context) else v
+            for k, v in self.field_values.items()
+        }
+        dct = pd.Series(field_values).to_dict()
 
         if project_cols is not None and len(project_cols) > 0:
             project_field_names = set(field.split(".")[-1] for field in project_cols)
             dct = {k: v for k, v in dct.items() if k in project_field_names}
 
         if not include_bytes:
-            for k
-
+            for k in dct:
+                field_type = self.field_types[k]
+                if field_type.annotation in [bytes, AudioBase64, ImageBase64, list[bytes], list[ImageBase64]]:
                    dct[k] = "<bytes>"
 
         if bytes_to_str:
@@ -289,12 +369,21 @@ class DataRecord:
                 elif isinstance(v, list) and len(v) > 0 and any([isinstance(elt, bytes) for elt in v]):
                     dct[k] = [elt.decode("utf-8") if isinstance(elt, bytes) else elt for elt in v]
 
+        if _sorted:
+            dct = dict(sorted(dct.items()))
+
+        if mask_filepaths:
+            for k in dct:
+                field_type = self.field_types[k]
+                if field_type.annotation in [AudioBase64, AudioFilepath, ImageBase64, ImageFilepath, ImageURL]:
+                    dct[k] = "<bytes>"
+
         return dct
 
 
 class DataRecordSet:
     """
-    A DataRecordSet contains a list of DataRecords that share the same schema, same
+    A DataRecordSet contains a list of DataRecords that share the same schema, same parent(s), and same source(s).
 
     We explicitly check that this is True.
 
@@ -305,20 +394,22 @@ class DataRecordSet:
         data_records: list[DataRecord],
         record_op_stats: list[RecordOpStats],
         field_to_score_fn: dict[str, str | callable] | None = None,
+        input: int | DataRecord | list[DataRecord] | tuple[list[DataRecord]] | None = None,
     ):
-        #
-        if len(data_records) > 0:
-            parent_id = data_records[0].parent_id
-            error_msg = "DataRecordSet must be constructed from the output of executing a single operator on a single input."
-            assert all([dr.parent_id == parent_id for dr in data_records]), error_msg
-
-        # set data_records, parent_id, and source_idx; note that it is possible for
+        # set data_records, parent_ids, and source_indices; note that it is possible for
         # data_records to be an empty list in the event of a failed convert
         self.data_records = data_records
-        self.
-        self.
+        self.parent_ids = data_records[0].parent_ids if len(data_records) > 0 else None
+        self.source_indices = data_records[0].source_indices if len(data_records) > 0 else None
         self.schema = data_records[0].schema if len(data_records) > 0 else None
 
+        # the input to the operator which produced the data_records; type is tuple[DataRecord] | tuple[int]
+        # - for scan operators, input is a singleton tuple[int] which wraps the source_idx, e.g.: (source_idx,)
+        # - for join operators, input is a tuple with one entry for the left input DataRecord and one entry for the right input DataRecord
+        # - for aggregate operators, input is a tuple with all the input DataRecords to the aggregation
+        # - for all other operaotrs, input is a singleton tuple[DataRecord] which wraps the single input
+        self.input = input
+
         # set statistics for generating these records
         self.record_op_stats = record_op_stats
 
@@ -350,7 +441,7 @@ class DataRecordCollection:
     The difference between DataRecordSet and DataRecordCollection
 
     Goal:
-        DataRecordSet is a set of DataRecords that share the same schema, same
+        DataRecordSet is a set of DataRecords that share the same schema, same parents, and same sources.
        DataRecordCollection is a general wrapper for list[DataRecord].
 
    Usage:
```