PyPI - palimpzest - Versions diffs - 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

palimpzest/constants.py +1 -0
palimpzest/core/data/dataset.py +33 -5
palimpzest/core/elements/groupbysig.py +10 -1
palimpzest/core/elements/records.py +16 -7
palimpzest/core/lib/schemas.py +20 -3
palimpzest/core/models.py +10 -4
palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
palimpzest/query/execution/execution_strategy.py +13 -11
palimpzest/query/execution/mab_execution_strategy.py +40 -14
palimpzest/query/execution/parallel_execution_strategy.py +31 -7
palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
palimpzest/query/generators/generators.py +1 -1
palimpzest/query/operators/__init__.py +7 -6
palimpzest/query/operators/aggregate.py +110 -5
palimpzest/query/operators/convert.py +1 -1
palimpzest/query/operators/join.py +279 -23
palimpzest/query/operators/logical.py +20 -8
palimpzest/query/operators/mixture_of_agents.py +3 -1
palimpzest/query/operators/physical.py +5 -2
palimpzest/query/operators/rag.py +5 -4
palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
palimpzest/query/optimizer/__init__.py +7 -3
palimpzest/query/optimizer/cost_model.py +5 -5
palimpzest/query/optimizer/optimizer.py +3 -2
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/rules.py +31 -11
palimpzest/query/optimizer/tasks.py +4 -4
palimpzest/query/processor/config.py +1 -0
palimpzest/utils/progress.py +51 -23
palimpzest/validator/validator.py +7 -7
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/METADATA +26 -66
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/RECORD +35 -35
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/WHEEL +0 -0
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.9.0.dist-info → palimpzest-1.1.0.dist-info}/top_level.txt +0 -0

palimpzest/query/generators/generators.py CHANGED Viewed

@@ -338,7 +338,7 @@ class Generator(Generic[ContextType, InputType]):
                     reasoning_effort = "minimal" if self.reasoning_effort is None else self.reasoning_effort
                     completion_kwargs = {"reasoning_effort": reasoning_effort, **completion_kwargs}
             if self.model.is_vllm_model():
-                completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key") **completion_kwargs}
+                completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key"), **completion_kwargs}
             completion = litellm.completion(model=self.model_name, messages=messages, **completion_kwargs)
             end_time = time.time()
             logger.debug(f"Generated completion in {end_time - start_time:.2f} seconds")

palimpzest/query/operators/__init__.py CHANGED Viewed

@@ -5,6 +5,7 @@ from palimpzest.query.operators.aggregate import CountAggregateOp as _CountAggre
 from palimpzest.query.operators.aggregate import MaxAggregateOp as _MaxAggregateOp
 from palimpzest.query.operators.aggregate import MinAggregateOp as _MinAggregateOp
 from palimpzest.query.operators.aggregate import SemanticAggregate as _SemanticAggregate
+from palimpzest.query.operators.aggregate import SumAggregateOp as _SumAggregateOp
 from palimpzest.query.operators.convert import ConvertOp as _ConvertOp
 from palimpzest.query.operators.convert import LLMConvert as _LLMConvert
 from palimpzest.query.operators.convert import LLMConvertBonded as _LLMConvertBonded
@@ -50,7 +51,7 @@ from palimpzest.query.operators.logical import (
     Project as _Project,
 )
 from palimpzest.query.operators.logical import (
-    RetrieveScan as _RetrieveScan,
+    TopKScan as _TopKScan,
 )
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsConvert as _MixtureOfAgentsConvert
 from palimpzest.query.operators.mixture_of_agents import MixtureOfAgentsFilter as _MixtureOfAgentsFilter
@@ -58,11 +59,11 @@ from palimpzest.query.operators.physical import PhysicalOperator as _PhysicalOpe
 from palimpzest.query.operators.project import ProjectOp as _ProjectOp
 from palimpzest.query.operators.rag import RAGConvert as _RAGConvert
 from palimpzest.query.operators.rag import RAGFilter as _RAGFilter
-from palimpzest.query.operators.retrieve import RetrieveOp as _RetrieveOp
 from palimpzest.query.operators.scan import MarshalAndScanDataOp as _MarshalAndScanDataOp
 from palimpzest.query.operators.scan import ScanPhysicalOp as _ScanPhysicalOp
 from palimpzest.query.operators.split import SplitConvert as _SplitConvert
 from palimpzest.query.operators.split import SplitFilter as _SplitFilter
+from palimpzest.query.operators.topk import TopKOp as _TopKOp
 LOGICAL_OPERATORS = [
     _LogicalOperator,
@@ -75,12 +76,12 @@ LOGICAL_OPERATORS = [
     _LogicalJoinOp,
     _LimitScan,
     _Project,
-    _RetrieveScan,
+    _TopKScan,
 ]
 PHYSICAL_OPERATORS = (
     # aggregate
-    [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp, _MaxAggregateOp, _MinAggregateOp, _SemanticAggregate]
+    [_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp, _MaxAggregateOp, _MinAggregateOp, _SemanticAggregate, _SumAggregateOp]
     # convert
     + [_ConvertOp, _NonLLMConvert, _LLMConvert, _LLMConvertBonded]
     # critique and refine
@@ -103,8 +104,8 @@ PHYSICAL_OPERATORS = (
     + [_ProjectOp]
     # rag
     + [_RAGConvert, _RAGFilter]
-    # retrieve
-    + [_RetrieveOp]
+    # top-k
+    + [_TopKOp]
     # split
     + [_SplitConvert, _SplitFilter]
 )

palimpzest/query/operators/aggregate.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import contextlib
 import time
 from typing import Any
@@ -14,7 +15,7 @@ from palimpzest.constants import (
 )
 from palimpzest.core.elements.groupbysig import GroupBySig
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
-from palimpzest.core.lib.schemas import Average, Count, Max, Min
+from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum
 from palimpzest.core.models import OperatorCostEstimates, RecordOpStats
 from palimpzest.query.generators.generators import Generator
 from palimpzest.query.operators.physical import PhysicalOperator
@@ -68,6 +69,16 @@ class ApplyGroupByOp(AggregateOp):
             return 0
         elif func.lower() == "average":
             return (0, 0)
+        elif func.lower() == "sum":
+            return 0
+        elif func.lower() == "min":
+            return float("inf")
+        elif func.lower() == "max":
+            return float("-inf")
+        elif func.lower() == "list":
+            return []
+        elif func.lower() == "set":
+            return set()
         else:
             raise Exception("Unknown agg function " + func)
@@ -76,16 +87,34 @@ class ApplyGroupByOp(AggregateOp):
         if func.lower() == "count":
             return state + 1
         elif func.lower() == "average":
-            sum, cnt = state
+            sum_, cnt = state
+            if val is None:
+                return (sum_, cnt)
+            return (sum_ + val, cnt + 1)
+        elif func.lower() == "sum":
+            if val is None:
+                return state
+            return state + sum(val) if isinstance(val, list) else state + val
+        elif func.lower() == "min":
+            if val is None:
+                return state
+            return min(state, min(val) if isinstance(val, list) else val)
+        elif func.lower() == "max":
             if val is None:
-                return (sum, cnt)
-            return (sum + val, cnt + 1)
+                return state
+            return max(state, max(val) if isinstance(val, list) else val)
+        elif func.lower() == "list":
+            state.append(val)
+            return state
+        elif func.lower() == "set":
+            state.add(val)
+            return state
         else:
             raise Exception("Unknown agg function " + func)
     @staticmethod
     def agg_final(func, state):
-        if func.lower() == "count":
+        if func.lower() in ["count", "sum", "min", "max", "list", "set"]:
             return state
         elif func.lower() == "average":
             sum, cnt = state
@@ -240,6 +269,82 @@ class AverageAggregateOp(AggregateOp):
         return DataRecordSet([dr], [record_op_stats])
+class SumAggregateOp(AggregateOp):
+    # NOTE: we don't actually need / use agg_func here (yet)
+    def __init__(self, agg_func: AggFunc, *args, **kwargs):
+        # enforce that output schema is correct
+        assert kwargs["output_schema"].model_fields.keys() == Sum.model_fields.keys(), "SumAggregateOp requires output_schema to be Sum"
+        # enforce that input schema is a single numeric field
+        input_field_types = list(kwargs["input_schema"].model_fields.values())
+        assert len(input_field_types) == 1, "SumAggregateOp requires input_schema to have exactly one field"
+        numeric_field_types = [
+            bool, int, float, int | float,
+            bool | None, int | None, float | None, int | float | None,
+            bool | Any, int | Any, float | Any, int | float | Any,
+            bool | None | Any, int | None | Any, float | None | Any, int | float | None | Any,
+        ]
+        is_numeric = input_field_types[0].annotation in numeric_field_types
+        assert is_numeric, f"SumAggregateOp requires input_schema to have a numeric field type, i.e. one of: {numeric_field_types}\nGot: {input_field_types[0]}"
+        # call parent constructor
+        super().__init__(*args, **kwargs)
+        self.agg_func = agg_func
+    def __str__(self):
+        op = super().__str__()
+        op += f"    Function: {str(self.agg_func)}\n"
+        return op
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        return {"agg_func": str(self.agg_func), **id_params}
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        return {"agg_func": self.agg_func, **op_params}
+    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+        # for now, assume applying the aggregation takes negligible additional time (and no cost in USD)
+        return OperatorCostEstimates(
+            cardinality=1,
+            time_per_record=0,
+            cost_per_record=0,
+            quality=1.0,
+        )
+    def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
+        start_time = time.time()
+        # NOTE: we currently do not guarantee that input values conform to their specified type;
+        #       as a result, we simply omit any values which do not parse to a float from the average
+        # NOTE: right now we perform a check in the constructor which enforces that the input_schema
+        #       has a single field which is numeric in nature; in the future we may want to have a
+        #       cleaner way of computing the value (rather than `float(list(candidate...))` below)
+        summation = 0
+        for candidate in candidates:
+            with contextlib.suppress(Exception):
+                summation += float(list(candidate.to_dict().values())[0])
+        data_item = Sum(sum=summation)
+        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
+        # create RecordOpStats object
+        record_op_stats = RecordOpStats(
+            record_id=dr._id,
+            record_parent_ids=dr._parent_ids,
+            record_source_indices=dr._source_indices,
+            record_state=dr.to_dict(include_bytes=False),
+            full_op_id=self.get_full_op_id(),
+            logical_op_id=self.logical_op_id,
+            op_name=self.op_name(),
+            time_per_record=time.time() - start_time,
+            cost_per_record=0.0,
+        )
+        return DataRecordSet([dr], [record_op_stats])
 class CountAggregateOp(AggregateOp):
     # NOTE: we don't actually need / use agg_func here (yet)

palimpzest/query/operators/convert.py CHANGED Viewed

@@ -320,7 +320,7 @@ class LLMConvert(ConvertOp):
         est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
         # get est. of conversion time per record from model card;
-        model_name = self.model.value if getattr(self, "model", None) is not None else Model.GPT_4o_MINI.value
+        model_name = self.model.value
         model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
         # get est. of conversion cost (in USD) per record from model card

palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

palimpzest 0.9.0__py3-none-any.whl → 1.1.0__py3-none-any.whl