palimpzest 0.6.4__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +5 -0
- palimpzest/constants.py +110 -43
- palimpzest/core/__init__.py +0 -78
- palimpzest/core/data/dataclasses.py +382 -44
- palimpzest/core/elements/filters.py +7 -3
- palimpzest/core/elements/index.py +70 -0
- palimpzest/core/elements/records.py +33 -11
- palimpzest/core/lib/fields.py +1 -0
- palimpzest/core/lib/schemas.py +4 -3
- palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
- palimpzest/prompts/prompt_factory.py +44 -7
- palimpzest/prompts/split_merge_prompts.py +56 -0
- palimpzest/prompts/split_proposer_prompts.py +55 -0
- palimpzest/query/execution/execution_strategy.py +435 -53
- palimpzest/query/execution/execution_strategy_type.py +20 -0
- palimpzest/query/execution/mab_execution_strategy.py +532 -0
- palimpzest/query/execution/parallel_execution_strategy.py +143 -172
- palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
- palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
- palimpzest/query/generators/api_client_factory.py +31 -0
- palimpzest/query/generators/generators.py +256 -76
- palimpzest/query/operators/__init__.py +1 -2
- palimpzest/query/operators/code_synthesis_convert.py +33 -18
- palimpzest/query/operators/convert.py +30 -97
- palimpzest/query/operators/critique_and_refine_convert.py +5 -6
- palimpzest/query/operators/filter.py +7 -10
- palimpzest/query/operators/logical.py +54 -10
- palimpzest/query/operators/map.py +130 -0
- palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
- palimpzest/query/operators/physical.py +3 -12
- palimpzest/query/operators/rag_convert.py +66 -18
- palimpzest/query/operators/retrieve.py +230 -34
- palimpzest/query/operators/scan.py +5 -2
- palimpzest/query/operators/split_convert.py +169 -0
- palimpzest/query/operators/token_reduction_convert.py +8 -14
- palimpzest/query/optimizer/__init__.py +4 -16
- palimpzest/query/optimizer/cost_model.py +73 -266
- palimpzest/query/optimizer/optimizer.py +87 -58
- palimpzest/query/optimizer/optimizer_strategy.py +18 -97
- palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
- palimpzest/query/optimizer/plan.py +2 -3
- palimpzest/query/optimizer/primitives.py +5 -3
- palimpzest/query/optimizer/rules.py +336 -172
- palimpzest/query/optimizer/tasks.py +30 -100
- palimpzest/query/processor/config.py +38 -22
- palimpzest/query/processor/nosentinel_processor.py +16 -520
- palimpzest/query/processor/processing_strategy_type.py +28 -0
- palimpzest/query/processor/query_processor.py +38 -206
- palimpzest/query/processor/query_processor_factory.py +117 -130
- palimpzest/query/processor/sentinel_processor.py +90 -0
- palimpzest/query/processor/streaming_processor.py +25 -32
- palimpzest/sets.py +88 -41
- palimpzest/utils/model_helpers.py +8 -7
- palimpzest/utils/progress.py +368 -152
- palimpzest/utils/token_reduction_helpers.py +1 -3
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/METADATA +19 -9
- palimpzest-0.7.1.dist-info/RECORD +96 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/WHEEL +1 -1
- palimpzest/query/processor/mab_sentinel_processor.py +0 -884
- palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
- palimpzest/utils/index_helpers.py +0 -6
- palimpzest-0.6.4.dist-info/RECORD +0 -87
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {palimpzest-0.6.4.dist-info → palimpzest-0.7.1.dist-info}/top_level.txt +0 -0
palimpzest/query/processor/streaming_processor.py
CHANGED
@@ -1,8 +1,8 @@
+import logging
 import time
 
-from palimpzest.core.data.dataclasses import
+from palimpzest.core.data.dataclasses import PlanStats
 from palimpzest.core.elements.records import DataRecordCollection
-from palimpzest.policy import Policy
 from palimpzest.query.operators.aggregate import AggregateOp
 from palimpzest.query.operators.filter import FilterOp
 from palimpzest.query.operators.limit import LimitScanOp
@@ -11,6 +11,7 @@ from palimpzest.query.optimizer.plan import PhysicalPlan
 from palimpzest.query.processor.query_processor import QueryProcessor
 from palimpzest.sets import Dataset
 
+logger = logging.getLogger(__name__)
 
 class StreamingQueryProcessor(QueryProcessor):
     """This class can be used for a streaming, record-based execution.
@@ -24,6 +25,7 @@ class StreamingQueryProcessor(QueryProcessor):
         self.current_scan_idx: int = 0
         self.plan_generated: bool = False
         self.records_count: int = 0
+        logger.info("Initialized StreamingQueryProcessor")
 
     @property
     def plan(self) -> PhysicalPlan:
@@ -45,33 +47,32 @@ class StreamingQueryProcessor(QueryProcessor):
     def plan_stats(self, plan_stats: PlanStats):
         self._plan_stats = plan_stats
 
-    def generate_plan(self, dataset: Dataset
+    def generate_plan(self, dataset: Dataset):
         # self.clear_cached_examples()
         start_time = time.time()
 
+        # check that the plan does not contain any aggregation operators
+        for op in self.plan.operators:
+            if isinstance(op, AggregateOp):
+                raise Exception("You cannot have a Streaming Execution if there is an Aggregation Operator")
+
         # TODO: Do we need to re-initialize the optimizer here?
         # Effectively always use the optimal strategy
         optimizer = self.optimizer.deepcopy_clean()
-        plans = optimizer.optimize(dataset
+        plans = optimizer.optimize(dataset)
         self.plan = plans[0]
-        self.plan_stats = PlanStats(
-
-
-                raise Exception("You cannot have a Streaming Execution if there is an Aggregation Operator")
-            op_id = op.get_op_id()
-            op_name = op.op_name()
-            op_details = {k: str(v) for k, v in op.get_id_params().items()}
-            self.plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details)
-        print("Time for planning: ", time.time() - start_time)
+        self.plan_stats = PlanStats.from_plan(self.plan)
+        self.plan_stats.start()
+        logger.info(f"Time for planning: {time.time() - start_time:.2f} seconds")
         self.plan_generated = True
-
+        logger.info(f"Generated plan:\n{self.plan}")
         return self.plan
 
     def execute(self):
-
+        logger.info("Executing StreamingQueryProcessor")
         # Always delete cache
         if not self.plan_generated:
-            self.generate_plan(self.dataset
+            self.generate_plan(self.dataset)
 
         # if dry_run:
         #     yield [], self.plan, self.plan_stats
@@ -82,11 +83,14 @@ class StreamingQueryProcessor(QueryProcessor):
             # print("Iteration number: ", idx+1, "out of", len(input_records))
             output_records = self.execute_opstream(self.plan, record)
             if idx == len(input_records) - 1:
-
-                self.plan_stats.
+                # finalize plan stats
+                self.plan_stats.finish()
                 self.plan_stats.plan_str = str(self.plan)
             yield DataRecordCollection(output_records, plan_stats=self.plan_stats)
 
+        logger.info("Done executing StreamingQueryProcessor")
+
+
     def get_input_records(self):
         scan_operator = self.plan.operators[0]
         assert isinstance(scan_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
@@ -102,12 +106,7 @@ class StreamingQueryProcessor(QueryProcessor):
             input_records += record_set.data_records
             record_op_stats += record_set.record_op_stats
 
-
-        self.plan_stats.operator_stats[op_id].add_record_op_stats(
-            record_op_stats,
-            source_op_id=None,
-            plan_id=self.plan.plan_id,
-        )
+        self.plan_stats.add_record_op_stats(record_op_stats)
 
         return input_records
 
@@ -116,13 +115,11 @@ class StreamingQueryProcessor(QueryProcessor):
         input_records = [record]
         record_op_stats_lst = []
 
-        for
+        for operator in plan.operators:
             # TODO: this being defined in the for loop potentially makes the return
             # unbounded if plan.operators is empty. This should be defined outside the loop
             # and the loop refactored to account for not redeclaring this for each operator
             output_records = []
-            op_id = operator.get_op_id()
-            prev_op_id = plan.operators[op_idx - 1].get_op_id() if op_idx > 1 else None
 
             if isinstance(operator, ScanPhysicalOp):
                 continue
@@ -145,11 +142,7 @@ class StreamingQueryProcessor(QueryProcessor):
             if not output_records:
                 break
 
-            self.plan_stats.
-                record_op_stats_lst,
-                source_op_id=prev_op_id,
-                plan_id=plan.plan_id,
-            )
+            self.plan_stats.add_record_op_stats(record_op_stats_lst)
             input_records = output_records
             self.records_count += len(output_records)
 
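The streaming-processor hunks above replace the hand-rolled OperatorStats bookkeeping with the new PlanStats lifecycle. A minimal sketch of that lifecycle, using only the calls visible in this diff (from_plan, start, add_record_op_stats, finish); run_plan_with_stats and execute_fn are illustrative names, not part of the package:

    from palimpzest.core.data.dataclasses import PlanStats

    def run_plan_with_stats(plan, execute_fn):
        # build per-operator stats containers directly from the physical plan,
        # replacing the old per-operator OperatorStats(...) construction
        plan_stats = PlanStats.from_plan(plan)
        plan_stats.start()  # begin timing the plan execution

        output_records, record_op_stats = execute_fn(plan)
        # 0.7.1 aggregates operator stats without op_id / source_op_id plumbing
        plan_stats.add_record_op_stats(record_op_stats)

        plan_stats.finish()  # finalize timing
        plan_stats.plan_str = str(plan)
        return output_records, plan_stats
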
palimpzest/sets.py
CHANGED
@@ -4,18 +4,18 @@ from pathlib import Path
 from typing import Callable
 
 import pandas as pd
+from chromadb.api.models.Collection import Collection
+from ragatouille.RAGPretrainedModel import RAGPretrainedModel
 
 from palimpzest.constants import AggFunc, Cardinality
 from palimpzest.core.data.datareaders import DataReader
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.groupbysig import GroupBySig
-from palimpzest.core.lib.fields import ListField, StringField
 from palimpzest.core.lib.schemas import Number, Schema
 from palimpzest.policy import construct_policy_from_kwargs
 from palimpzest.query.processor.config import QueryProcessorConfig
 from palimpzest.utils.datareader_helpers import get_local_datareader
 from palimpzest.utils.hash_helpers import hash_for_serialized_dict
-from palimpzest.utils.index_helpers import get_index_str
 
 
 #####################################################
@@ -35,15 +35,15 @@ class Set:
         agg_func: AggFunc | None = None,
         group_by: GroupBySig | None = None,
         project_cols: list[str] | None = None,
-        index
+        index: Collection | RAGPretrainedModel | None = None,
         search_func: Callable | None = None,
         search_attr: str | None = None,
-
+        output_attrs: list[dict] | None = None,
         k: int | None = None,  # TODO: disambiguate `k` to be something like `retrieve_k`
         limit: int | None = None,
         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
         depends_on: list[str] | None = None,
-
+        cache: bool = False,
     ):
         self._schema = schema
         self._source = source
@@ -56,12 +56,12 @@ class Set:
         self._index = index
         self._search_func = search_func
         self._search_attr = search_attr
-        self.
+        self._output_attrs = output_attrs
         self._k = k
         self._limit = limit
         self._cardinality = cardinality
         self._depends_on = [] if depends_on is None else sorted(depends_on)
-        self.
+        self._cache = cache
 
     @property
     def schema(self) -> Schema:
@@ -83,16 +83,16 @@ class Set:
             "source": self._source.serialize(),
             "desc": repr(self._desc),
             "filter": None if self._filter is None else self._filter.serialize(),
-            "udf": None if self._udf is None else
+            "udf": None if self._udf is None else self._udf.__name__,
             "agg_func": None if self._agg_func is None else self._agg_func.value,
             "cardinality": self._cardinality,
             "limit": self._limit,
-            "group_by":
-            "project_cols":
-            "index": None if self._index is None else
-            "search_func": None if self._search_func is None else
+            "group_by": None if self._group_by is None else self._group_by.serialize(),
+            "project_cols": None if self._project_cols is None else self._project_cols,
+            "index": None if self._index is None else self._index.__class__.__name__,
+            "search_func": None if self._search_func is None else self._search_func.__name__,
             "search_attr": self._search_attr,
-            "
+            "output_attrs": None if self._output_attrs is None else str(self._output_attrs),
             "k": self._k,
         }
 
@@ -132,10 +132,31 @@ class Dataset(Set):
 
         # get the schema
         schema = updated_source.schema if schema is None else schema
-
+
         # intialize class
         super().__init__(updated_source, schema, *args, **kwargs)
 
+    def copy(self):
+        return Dataset(
+            source=self._source.copy() if isinstance(self._source, Set) else self._source,
+            schema=self._schema,
+            desc=self._desc,
+            filter=self._filter,
+            udf=self._udf,
+            agg_func=self._agg_func,
+            group_by=self._group_by,
+            project_cols=self._project_cols,
+            index=self._index,
+            search_func=self._search_func,
+            search_attr=self._search_attr,
+            output_attrs=self._output_attrs,
+            k=self._k,
+            limit=self._limit,
+            cardinality=self._cardinality,
+            depends_on=self._depends_on,
+            cache=self._cache,
+        )
+
     def filter(
         self,
         _filter: Callable,
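Dataset.copy() (added above) rebuilds the node with identical parameters and recursively copies any upstream Set source; non-Set sources (e.g. a DataReader) are carried over by reference. A hedged usage sketch, where docs stands in for any existing Dataset:

    # Illustrative only: `docs` stands in for any existing Dataset.
    snapshot = docs.copy()        # independent copy of the logical plan
    assert snapshot is not docs   # a new Dataset node, same parameters
    # upstream Set sources were copied recursively, so extending the copy
    # cannot alias state on the original chain
    branch = snapshot.sem_filter("The document mentions a merger.")
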
@@ -159,9 +180,9 @@ class Dataset(Set):
             schema=self.schema,
             filter=f,
             depends_on=depends_on,
-
+            cache=self._cache,
         )
-
+
     def sem_filter(
         self,
         _filter: str,
@@ -173,7 +194,7 @@ class Dataset(Set):
             f = Filter(_filter)
         else:
             raise Exception("sem_filter() only supports `str` input for _filter.", type(_filter))
-
+
         if isinstance(depends_on, str):
             depends_on = [depends_on]
 
@@ -182,11 +203,11 @@ class Dataset(Set):
             schema=self.schema,
             filter=f,
             depends_on=depends_on,
-
+            cache=self._cache,
         )
 
     def sem_add_columns(self, cols: list[dict] | type[Schema],
-                        cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                        cardinality: Cardinality = Cardinality.ONE_TO_ONE,
                         depends_on: str | list[str] | None = None,
                         desc: str = "Add new columns via semantic reasoning") -> Dataset:
         """
@@ -217,7 +238,7 @@ class Dataset(Set):
             cardinality=cardinality,
             depends_on=depends_on,
             desc=desc,
-
+            cache=self._cache,
         )
 
     def add_columns(self, udf: Callable,
@@ -254,7 +275,7 @@ class Dataset(Set):
                 col_dict["desc"] = col_dict.get("desc", "New column: " + col_dict["name"])
                 updated_cols.append(col_dict)
             new_output_schema = self.schema.add_fields(updated_cols)
-
+
         elif issubclass(cols, Schema):
             new_output_schema = self.schema.union(cols)
 
@@ -268,7 +289,24 @@ class Dataset(Set):
             cardinality=cardinality,
             desc=desc,
             depends_on=depends_on,
-
+            cache=self._cache,
+        )
+
+    def map(self, udf: Callable) -> Dataset:
+        """
+        Apply a UDF map function.
+
+        Examples:
+            map(udf=clean_column_values)
+        """
+        if udf is None:
+            raise ValueError("`udf` must be provided for map.")
+
+        return Dataset(
+            source=self,
+            schema=self.schema,
+            udf=udf,
+            cache=self._cache,
         )
 
     def count(self) -> Dataset:
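The new map() above wraps a UDF while reusing self.schema, so the output schema is unchanged. A usage sketch in the spirit of the docstring's clean_column_values example; the UDF body and record access are illustrative, since the record interface is not part of this diff:

    # Illustrative UDF: normalize one column in place; the exact record
    # representation passed to 0.7.1 UDFs may differ.
    def clean_column_values(record):
        record["title"] = record["title"].strip().lower()
        return record

    cleaned = dataset.map(udf=clean_column_values)  # schema unchanged
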
@@ -278,7 +316,7 @@ class Dataset(Set):
             schema=Number,
             desc="Count results",
             agg_func=AggFunc.COUNT,
-
+            cache=self._cache,
         )
 
     def average(self) -> Dataset:
@@ -288,7 +326,7 @@ class Dataset(Set):
             schema=Number,
             desc="Average results",
             agg_func=AggFunc.AVERAGE,
-
+            cache=self._cache,
         )
 
     def groupby(self, groupby: GroupBySig) -> Dataset:
@@ -297,34 +335,43 @@ class Dataset(Set):
             schema=groupby.output_schema(),
             desc="Group By",
             group_by=groupby,
-
+            cache=self._cache,
         )
 
     def retrieve(
-        self,
+        self,
+        index: Collection | RAGPretrainedModel,
+        search_attr: str,
+        output_attrs: list[dict] | type[Schema],
+        search_func: Callable | None = None,
+        k: int = -1,
     ) -> Dataset:
         """
-        Retrieve the top
-
-        and the `output_attr` with type ListField(StringField). `search_func` is a function of
-        type (index, query: str | list(str), k: int) -> list[str]. It should implement the lookup
-        logic for the index and return the top k results. The value of the `search_attr` field is
-        used as the query to lookup in the index. The results are stored in the `output_attr`
-        field. `output_attr_desc` is the description of the `output_attr` field.
+        Retrieve the top-k nearest neighbors of the value of the `search_attr` from the `index` and
+        use these results to construct the `output_attrs` field(s).
         """
-
-
-
+        new_output_schema = None
+        if isinstance(output_attrs, list):
+            new_output_schema = self.schema.add_fields(output_attrs)
+        elif issubclass(output_attrs, Schema):
+            new_output_schema = self.schema.union(output_attrs)
+        else:
+            raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+        # TODO: revisit once we can think through abstraction(s)
+        # # construct the PZIndex from the user-provided index
+        # index = index_factory(index)
+
         return Dataset(
             source=self,
-            schema=
+            schema=new_output_schema,
             desc="Retrieve",
             index=index,
             search_func=search_func,
             search_attr=search_attr,
-
+            output_attrs=output_attrs,
             k=k,
-
+            cache=self._cache,
         )
 
     def limit(self, n: int) -> Dataset:
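The reworked retrieve() above accepts the index object directly (a chromadb Collection or a RAGatouille RAGPretrainedModel, matching the new imports) plus search_attr and output_attrs. A hedged sketch of the new call shape; the collection name and the field dictionary are illustrative, and field-dict keys beyond name and desc are assumed from the add_columns hunk:

    import chromadb

    client = chromadb.Client()
    collection = client.get_or_create_collection("legal-cases")  # assumed populated

    matches = dataset.retrieve(
        index=collection,
        search_attr="claim",  # this field's value is used as the query
        output_attrs=[
            {"name": "relevant_cases", "desc": "Cases related to the claim"},
        ],
        k=5,  # top-k nearest neighbors
    )
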
@@ -334,7 +381,7 @@ class Dataset(Set):
             schema=self.schema,
             desc="LIMIT " + str(n),
             limit=n,
-
+            cache=self._cache,
         )
 
     def project(self, project_cols: list[str] | str) -> Dataset:
@@ -343,7 +390,7 @@ class Dataset(Set):
             source=self,
             schema=self.schema.project(project_cols),
             project_cols=project_cols if isinstance(project_cols, list) else [project_cols],
-
+            cache=self._cache,
         )
 
     def run(self, config: QueryProcessorConfig | None = None, **kwargs):
palimpzest/utils/model_helpers.py
CHANGED
@@ -26,7 +26,7 @@ def get_models(include_vision: bool = False) -> list[Model]:
         models.extend([Model.GPT_4o, Model.GPT_4o_MINI])
 
     if os.getenv("TOGETHER_API_KEY") is not None:
-        models.extend([Model.LLAMA3, Model.MIXTRAL])
+        models.extend([Model.LLAMA3, Model.MIXTRAL, Model.DEEPSEEK])
 
     if include_vision:
         vision_models = get_vision_models()
@@ -39,23 +39,24 @@ TEXT_MODEL_PRIORITY = [
     Model.GPT_4o,
     Model.GPT_4o_MINI,
     Model.LLAMA3,
-    Model.MIXTRAL
+    Model.MIXTRAL,
+    Model.DEEPSEEK,
 ]
 
 VISION_MODEL_PRIORITY = [
     Model.GPT_4o_V,
     Model.GPT_4o_MINI_V,
-    Model.LLAMA3_V
+    Model.LLAMA3_V,
 ]
-def get_champion_model(available_models, vision=False):
+def get_champion_model(available_models, vision=False):
     # Select appropriate priority list based on task
     model_priority = VISION_MODEL_PRIORITY if vision else TEXT_MODEL_PRIORITY
-
+
     # Return first available model from priority list
     for model in model_priority:
        if model in available_models:
            return model
-
+
     # If no suitable model found, raise informative error
     task_type = "vision" if vision else "text"
     raise Exception(
@@ -66,7 +67,7 @@ def get_champion_model(available_models, vision=False):
     )
 
 
-def
+def get_fallback_model(available_models, vision=False):
     return get_champion_model(available_models, vision)
 
 
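With Model.DEEPSEEK appended to both get_models() and TEXT_MODEL_PRIORITY, champion selection still walks the priority list top-down and returns the first available model. A small sketch of that behavior, using only the helpers shown above:

    from palimpzest.utils.model_helpers import get_champion_model, get_models

    available = get_models()                  # contents depend on which API keys are set
    champion = get_champion_model(available)  # first hit in TEXT_MODEL_PRIORITY
    # e.g. with only TOGETHER_API_KEY set, `available` is [LLAMA3, MIXTRAL, DEEPSEEK]
    # and LLAMA3 wins, since it precedes MIXTRAL and DEEPSEEK in the priority list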