palimpzest 0.7.21__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.21.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,22 +1,24 @@
  import logging
  from abc import ABC, abstractmethod
- from concurrent.futures import ThreadPoolExecutor, wait
+ from concurrent.futures import ThreadPoolExecutor, as_completed

  import numpy as np
  from chromadb.api.models.Collection import Collection

- from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
- from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanStats, RecordOpStats
- from palimpzest.core.data.datareaders import DataReader
+ from palimpzest.constants import Cardinality
+ from palimpzest.core.data.dataset import Dataset
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
+ from palimpzest.core.models import GenerationStats, PlanStats, SentinelPlanStats
  from palimpzest.policy import Policy
  from palimpzest.query.operators.convert import LLMConvert
- from palimpzest.query.operators.filter import FilterOp, LLMFilter
+ from palimpzest.query.operators.filter import LLMFilter
+ from palimpzest.query.operators.join import JoinOp
  from palimpzest.query.operators.physical import PhysicalOperator
  from palimpzest.query.operators.retrieve import RetrieveOp
- from palimpzest.query.operators.scan import ScanPhysicalOp
+ from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan
  from palimpzest.utils.progress import PZSentinelProgressManager
+ from palimpzest.validator.validator import Validator

  logger = logging.getLogger(__name__)

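The import change above reflects a broader change in this file: instead of polling futures with wait() and a fixed sleep interval (the removed PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS constant), results are now consumed as they finish via as_completed(). A minimal sketch of that pattern, with an illustrative run_op stand-in for a physical-operator call (not a palimpzest API):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def run_op(x):
        # stand-in for invoking a physical operator on one input
        return x * 2

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(run_op, x) for x in range(8)]
        for future in as_completed(futures):
            # handle each result as soon as its future finishes,
            # instead of repeatedly calling wait() with a timeout
            result = future.result()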
@@ -24,35 +26,20 @@ class BaseExecutionStrategy:
  def __init__(self,
  scan_start_idx: int = 0,
  max_workers: int | None = None,
+ batch_size: int | None = None,
  num_samples: int | None = None,
- cache: bool = False,
  verbose: bool = False,
  progress: bool = True,
  *args,
  **kwargs):
  self.scan_start_idx = scan_start_idx
  self.max_workers = max_workers
+ self.batch_size = batch_size
  self.num_samples = num_samples
- self.cache = cache
  self.verbose = verbose
  self.progress = progress


- def _add_records_to_cache(self, target_cache_id: str, records: list[DataRecord]) -> None:
- """Add each record (which isn't filtered) to the cache for the given target_cache_id."""
- if self.cache:
- for record in records:
- if getattr(record, "passed_operator", True):
- # self.datadir.append_cache(target_cache_id, record)
- pass
-
- def _close_cache(self, target_cache_ids: list[str]) -> None:
- """Close the cache for each of the given target_cache_ids"""
- if self.cache:
- for target_cache_id in target_cache_ids: # noqa: B007
- # self.datadir.close_cache(target_cache_id)
- pass
-
  class ExecutionStrategy(BaseExecutionStrategy, ABC):
  """Base strategy for executing query plans. Defines how to execute a PhysicalPlan.
  """
@@ -66,19 +53,24 @@ class ExecutionStrategy(BaseExecutionStrategy, ABC):
  """Execute a single plan according to strategy"""
  pass

- def _create_input_queues(self, plan: PhysicalPlan) -> dict[str, list]:
+ def _create_input_queues(self, plan: PhysicalPlan) -> dict[str, dict[str, list]]:
  """Initialize input queues for each operator in the plan."""
- input_queues = {}
- for op in plan.operators:
- inputs = []
+ input_queues = {f"{topo_idx}-{op.get_full_op_id()}": {} for topo_idx, op in enumerate(plan)}
+ for topo_idx, op in enumerate(plan):
+ full_op_id = op.get_full_op_id()
+ unique_op_id = f"{topo_idx}-{full_op_id}"
  if isinstance(op, ScanPhysicalOp):
  scan_end_idx = (
- len(op.datareader)
+ len(op.datasource)
  if self.num_samples is None
- else min(self.scan_start_idx + self.num_samples, len(op.datareader))
+ else min(self.scan_start_idx + self.num_samples, len(op.datasource))
  )
- inputs = [idx for idx in range(self.scan_start_idx, scan_end_idx)]
- input_queues[op.get_full_op_id()] = inputs
+ input_queues[unique_op_id][f"source_{full_op_id}"] = [idx for idx in range(self.scan_start_idx, scan_end_idx)]
+ elif isinstance(op, ContextScanOp):
+ input_queues[unique_op_id][f"source_{full_op_id}"] = [None]
+ else:
+ for source_unique_full_op_id in plan.get_source_unique_full_op_ids(topo_idx, op):
+ input_queues[unique_op_id][source_unique_full_op_id] = []

  return input_queues

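The reworked _create_input_queues keys each operator's queue by its topological index plus full op id, and nests one sub-queue per upstream source. A hypothetical shape of the returned mapping for a simple scan -> convert plan (the op ids below are invented for illustration, not real identifiers from the package):

    input_queues = {
        "0-scan_abc123": {
            "source_scan_abc123": [0, 1, 2, 3],  # scan indices to read
        },
        "1-convert_def456": {
            "0-scan_abc123": [],  # later filled with records emitted by the scan
        },
    }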
@@ -90,7 +82,6 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  """
  def __init__(
  self,
- val_datasource: DataReader,
  k: int,
  j: int,
  sample_budget: int,
@@ -103,7 +94,6 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  **kwargs,
  ):
  super().__init__(*args, **kwargs)
- self.val_datasource = val_datasource
  self.k = k
  self.j = j
  self.sample_budget = sample_budget
@@ -114,292 +104,200 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  self.rng = np.random.default_rng(seed=seed)
  self.exp_name = exp_name

- # special cache which is used for tracking the target record sets for each (source_idx, logical_op_id)
- self.champion_output_cache: dict[int, dict[str, tuple[DataRecordSet, float]]] = {}
-
  # general cache which maps hash(logical_op_id, phys_op_id, hash(input)) --> record_set
  self.cache: dict[int, DataRecordSet] = {}

  # progress manager used to track progress of the execution
  self.progress_manager: PZSentinelProgressManager | None = None

- def _compute_quality(
- self,
- physical_op_cls: type[PhysicalOperator],
- record_set: DataRecordSet,
- target_record_set: DataRecordSet,
- ) -> DataRecordSet:
- """
- Compute the quality for the given `record_set` by comparing it to the `target_record_set`.
-
- Update the record_set by assigning the quality to each entry in its record_op_stats and
- returning the updated record_set.
- """
- # if this operation failed
- if len(record_set) == 0:
- record_set.record_op_stats[0].quality = 0.0
-
- # if this operation is a filter:
- # - return 1.0 if there's a match in the expected output which this operator does not filter out and 0.0 otherwise
- elif issubclass(physical_op_cls, FilterOp):
- # NOTE: we know that record_set.data_records will contain a single entry for a filter op
- record = record_set.data_records[0]
-
- # search for a record in the target with the same set of fields
- found_match_in_target = False
- for target_record in target_record_set:
- all_correct = True
- for field, value in record.field_values.items():
- if value != target_record[field]:
- all_correct = False
- break
-
- if all_correct:
- found_match_in_target = target_record.passed_operator
- break
-
- # set quality based on whether we found a match in the target and return
- record_set.record_op_stats[0].quality = int(record.passed_operator == found_match_in_target)
-
- return record_set
-
- # if this is a successful convert operation
- else:
- # NOTE: the following computation assumes we do not project out computed values
- # (and that the validation examples provide all computed fields); even if
- # a user program does add projection, we can ignore the projection on the
- # validation dataset and use the champion model (as opposed to the validation
- # output) for scoring fields which have their values projected out
-
- # GREEDY ALGORITHM
- # for each record in the expected output, we look for the computed record which maximizes the quality metric;
- # once we've identified that computed record we remove it from consideration for the next expected output
- field_to_score_fn = target_record_set.get_field_to_score_fn()
- for target_record in target_record_set:
- best_quality, best_record_op_stats = 0.0, None
- for record_op_stats in record_set.record_op_stats:
- # if we already assigned this record a quality, skip it
- if record_op_stats.quality is not None:
- continue
-
- # compute number of matches between this record's computed fields and this expected record's outputs
- total_quality = 0
- for field in record_op_stats.generated_fields:
- computed_value = record_op_stats.record_state.get(field, None)
- expected_value = target_record[field]
-
- # get the metric function for this field
- score_fn = field_to_score_fn.get(field, "exact")
-
- # compute exact match
- if score_fn == "exact":
- total_quality += int(computed_value == expected_value)
-
- # compute UDF metric
- elif callable(score_fn):
- total_quality += score_fn(computed_value, expected_value)
-
- # otherwise, throw an exception
- else:
- raise Exception(f"Unrecognized score_fn: {score_fn}")
-
- # compute recall and update best seen so far
- quality = total_quality / len(record_op_stats.generated_fields)
- if quality > best_quality:
- best_quality = quality
- best_record_op_stats = record_op_stats
-
- # set best_quality as quality for the best_record_op_stats
- if best_record_op_stats is not None:
- best_record_op_stats.quality = best_quality
-
- # for any records which did not receive a quality, set it to 0.0 as these are unexpected extras
- for record_op_stats in record_set.record_op_stats:
- if record_op_stats.quality is None:
- record_op_stats.quality = 0.0
-
- return record_set
-
  def _score_quality(
  self,
- physical_op_cls: type[PhysicalOperator],
- source_idx_to_record_sets: dict[int, list[DataRecordSet]],
- source_idx_to_target_record_set: dict[int, DataRecordSet],
- ) -> dict[int, list[DataRecordSet]]:
- """
- NOTE: This approach to cost modeling does not work directly for aggregation queries;
- for these queries, we would ask the user to provide validation data for the step immediately
- before a final aggregation
-
- NOTE: This function currently assumes that one-to-many converts do NOT create duplicate outputs.
- This assumption would break if, for example, we extracted the breed of every dog in an image.
- If there were two golden retrievers and a bernoodle in an image and we extracted:
-
- {"image": "file1.png", "breed": "Golden Retriever"}
- {"image": "file1.png", "breed": "Golden Retriever"}
- {"image": "file1.png", "breed": "Bernedoodle"}
-
- This function would currently give perfect accuracy to the following output:
-
- {"image": "file1.png", "breed": "Golden Retriever"}
- {"image": "file1.png", "breed": "Bernedoodle"}
-
- Even though it is missing one of the golden retrievers.
- """
+ validator: Validator,
+ source_indices_to_record_sets: dict[tuple[str], list[tuple[DataRecordSet, PhysicalOperator]]],
+ ) -> tuple[dict[int, list[DataRecordSet]], GenerationStats]:
  # extract information about the logical operation performed at this stage of the sentinel plan;
  # NOTE: we can infer these fields from context clues, but in the long-term we should have a more
  # principled way of getting these directly from attributes either stored in the sentinel_plan
  # or in the PhysicalOperator
- is_perfect_quality_op = (
- not issubclass(physical_op_cls, LLMConvert)
- and not issubclass(physical_op_cls, LLMFilter)
- and not issubclass(physical_op_cls, RetrieveOp)
- )
+ def is_perfect_quality_op(op: PhysicalOperator):
+ return (
+ not isinstance(op, LLMConvert)
+ and not isinstance(op, LLMFilter)
+ and not isinstance(op, RetrieveOp)
+ and not isinstance(op, JoinOp)
+ )
+
+ # create minimal set of futures necessary to compute quality of each output record
+ futures, full_hashes, full_hash_to_bool_output = [], set(), {}
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+ for _, record_set_tuples in source_indices_to_record_sets.items():
+ for record_set, op in record_set_tuples:
+ # if this operation does not involve an LLM, every record_op_stats object gets perfect quality
+ if is_perfect_quality_op(op):
+ for record_op_stats in record_set.record_op_stats:
+ record_op_stats.quality = 1.0
+ continue

- # compute quality of each output computed by this operator
- for source_idx, record_sets in source_idx_to_record_sets.items():
- # if this operation does not involve an LLM, every record_op_stats object gets perfect quality
- if is_perfect_quality_op:
- for record_set in record_sets:
- for record_op_stats in record_set.record_op_stats:
- record_op_stats.quality = 1.0
- continue
+ # if the operation failed, assign 0.0 quality
+ if len(record_set) == 0:
+ record_set.record_op_stats[0].quality = 0.0
+ continue

- # extract target output for this record set
- target_record_set = source_idx_to_target_record_set[source_idx]
+ # create future for map
+ if isinstance(op, LLMConvert) and op.cardinality is Cardinality.ONE_TO_ONE:
+ fields = op.generated_fields
+ input_record: DataRecord = record_set.input
+ output = record_set.data_records[0].to_dict(project_cols=fields)
+ output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
+ full_hash = f"{hash(input_record)}{hash(output_str)}"
+ if full_hash not in full_hashes:
+ full_hashes.add(full_hash)
+ futures.append(executor.submit(validator._score_map, op, fields, input_record, output, full_hash))
+
+ # create future for flat map
+ elif isinstance(op, LLMConvert) and op.cardinality is Cardinality.ONE_TO_MANY:
+ fields = op.generated_fields
+ input_record: DataRecord = record_set.input
+ output, output_strs = [], []
+ for data_record in record_set.data_records:
+ output.append(data_record.to_dict(project_cols=fields))
+ output_strs.append(data_record.to_json_str(project_cols=fields, bytes_to_str=True, sorted=True))
+ full_hash = f"{hash(input_record)}{hash(tuple(sorted(output_strs)))}"
+ if full_hash not in full_hashes:
+ full_hashes.add(full_hash)
+ futures.append(executor.submit(validator._score_flat_map, op, fields, input_record, output, full_hash))
+
+ # create future for retrieve
+ elif isinstance(op, RetrieveOp):
+ fields = op.generated_fields
+ input_record: DataRecord = record_set.input
+ output = record_set.data_records[0].to_dict(project_cols=fields)
+ output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
+ full_hash = f"{hash(input_record)}{hash(output_str)}"
+ if full_hash not in full_hashes:
+ full_hashes.add(full_hash)
+ futures.append(executor.submit(validator._score_retrieve, op, fields, input_record, output, full_hash))
+
+ # create future for filter
+ elif isinstance(op, LLMFilter):
+ filter_str = op.filter_obj.filter_condition
+ input_record: DataRecord = record_set.input
+ output = record_set.data_records[0].passed_operator
+ full_hash = f"{filter_str}{hash(input_record)}"
+ if full_hash not in full_hashes:
+ full_hash_to_bool_output[full_hash] = output
+ full_hashes.add(full_hash)
+ futures.append(executor.submit(validator._score_filter, op, filter_str, input_record, output, full_hash))
+
+ # create future for join
+ elif isinstance(op, JoinOp):
+ condition = op.condition
+ for left_idx, left_input_record in enumerate(record_set.input[0]):
+ for right_idx, right_input_record in enumerate(record_set.input[1]):
+ record_idx = left_idx * len(record_set.input[1]) + right_idx
+ output = record_set.data_records[record_idx].passed_operator
+ full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
+ if full_hash not in full_hashes:
+ full_hash_to_bool_output[full_hash] = output
+ full_hashes.add(full_hash)
+ futures.append(executor.submit(validator._score_join, op, condition, left_input_record, right_input_record, output, full_hash))
+
+ # collect results from futures
+ full_hash_to_score, validation_gen_stats = {}, GenerationStats()
+ for future in as_completed(futures):
+ score, gen_stats, full_hash = future.result()
+ full_hash_to_score[full_hash] = score
+ validation_gen_stats += gen_stats

- # for each record_set produced by an operation, compute its quality
- for record_set in record_sets:
- record_set = self._compute_quality(physical_op_cls, record_set, target_record_set)
+ # compute quality of each output computed by this operator
+ for _, record_set_tuples in source_indices_to_record_sets.items():
+ for record_set, op in record_set_tuples:
+ if is_perfect_quality_op(op) or len(record_set) == 0:
+ continue
+
+ if isinstance(op, LLMConvert) and op.cardinality is Cardinality.ONE_TO_ONE:
+ fields = op.generated_fields
+ input_record: DataRecord = record_set.input
+ output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
+ full_hash = f"{hash(input_record)}{hash(output_str)}"
+ record_set.record_op_stats[0].quality = full_hash_to_score[full_hash]
+
+ elif isinstance(op, LLMConvert) and op.cardinality is Cardinality.ONE_TO_MANY:
+ fields = op.generated_fields
+ input_record: DataRecord = record_set.input
+ output_strs = []
+ for data_record in record_set.data_records:
+ output_strs.append(data_record.to_json_str(project_cols=fields, bytes_to_str=True, sorted=True))
+ full_hash = f"{hash(input_record)}{hash(tuple(sorted(output_strs)))}"
+ score = full_hash_to_score[full_hash]
+ for record_op_stats in record_set.record_op_stats:
+ record_op_stats.quality = score
+
+ # TODO: this scoring function will (likely) bias towards small values of k since it
+ # measures precision and not recall / F1; will need to revisit this in the future
+ elif isinstance(op, RetrieveOp):
+ fields = op.generated_fields
+ input_record: DataRecord = record_set.input
+ output_str = record_set.data_records[0].to_json_str(project_cols=fields, bytes_to_str=True, sorted=True)
+ full_hash = f"{hash(input_record)}{hash(output_str)}"
+ score = full_hash_to_score[full_hash]
+ record_set.record_op_stats[0].quality = score
+
+ elif isinstance(op, LLMFilter):
+ filter_str = op.filter_obj.filter_condition
+ input_record: DataRecord = record_set.input
+ output = record_set.data_records[0].passed_operator
+ full_hash = f"{filter_str}{hash(input_record)}"
+ if output == full_hash_to_bool_output[full_hash]:
+ record_set.record_op_stats[0].quality = full_hash_to_score[full_hash]
+ else:
+ record_set.record_op_stats[0].quality = 1.0 - full_hash_to_score[full_hash]
+
+ elif isinstance(op, JoinOp):
+ condition = op.condition
+ for left_idx, left_input_record in enumerate(record_set.input[0]):
+ for right_idx, right_input_record in enumerate(record_set.input[1]):
+ record_idx = left_idx * len(record_set.input[1]) + right_idx
+ output = record_set.data_records[record_idx].passed_operator
+ full_hash = f"{condition}{hash(left_input_record)}{hash(right_input_record)}"
+ if output == full_hash_to_bool_output[full_hash]:
+ record_set.record_op_stats[record_idx].quality = full_hash_to_score[full_hash]
+ else:
+ record_set.record_op_stats[record_idx].quality = 1.0 - full_hash_to_score[full_hash]

  # return the quality annotated record sets
- return source_idx_to_record_sets
-
- def _get_target_record_sets(
- self,
- logical_op_id: str,
- source_idx_to_record_set_tuples: dict[int, list[tuple[DataRecordSet, PhysicalOperator, bool]]],
- expected_outputs: dict[int, dict] | None,
- ) -> dict[int, DataRecordSet]:
- # initialize mapping from source index to target record sets
- source_idx_to_target_record_set = {}
-
- for source_idx, record_set_tuples in source_idx_to_record_set_tuples.items():
- # get the first generated output for this source_idx
- base_target_record = None
- for record_set, _, _ in record_set_tuples:
- if len(record_set) > 0:
- base_target_record = record_set[0]
- break
-
- # compute availability of data
- base_target_present = base_target_record is not None
- labels_present = expected_outputs is not None
- labels_for_source_present = False
- if labels_present and source_idx in expected_outputs:
- labels = expected_outputs[source_idx].get("labels", [])
- labels_dict_lst = labels if isinstance(labels, list) else [labels]
- labels_for_source_present = labels_dict_lst != [] and labels_dict_lst != [None]
-
- # if we have a base target record and label info, use the label info to construct the target record set
- if base_target_present and labels_for_source_present:
- # get the field_to_score_fn
- field_to_score_fn = expected_outputs[source_idx].get("score_fn", {})
-
- # construct the target record set; we force passed_operator to be True for all target records
- target_records = []
- for labels_dict in labels_dict_lst:
- target_record = base_target_record.copy()
- for field, value in labels_dict.items():
- target_record[field] = value
- target_record.passed_operator = True
- target_records.append(target_record)
-
- source_idx_to_target_record_set[source_idx] = DataRecordSet(target_records, None, field_to_score_fn)
- continue
-
- # get the best computed output for this (source_idx, logical_op_id) so far (if one exists)
- champion_record_set, champion_op_quality = None, None
- if source_idx in self.champion_output_cache and logical_op_id in self.champion_output_cache[source_idx]:
- champion_record_set, champion_op_quality = self.champion_output_cache[source_idx][logical_op_id]
-
- # get the highest quality output that we just computed
- max_quality_record_set, max_op_quality = self._pick_champion_output(record_set_tuples)
-
- # if this new output is of higher quality than our previous champion (or if we didn't have
- # a previous champion) then we update our champion record set
- if champion_op_quality is None or (max_op_quality is not None and max_op_quality > champion_op_quality):
- champion_record_set, champion_op_quality = max_quality_record_set, max_op_quality
-
- # update the cache with the new champion record set and quality
- if source_idx not in self.champion_output_cache:
- self.champion_output_cache[source_idx] = {}
- self.champion_output_cache[source_idx][logical_op_id] = (champion_record_set, champion_op_quality)
-
- # set the target
- source_idx_to_target_record_set[source_idx] = champion_record_set
-
- return source_idx_to_target_record_set
-
- def _pick_champion_output(self, record_set_tuples: list[tuple[DataRecordSet, PhysicalOperator, bool]]) -> tuple[DataRecordSet, float | None]:
- # find the operator with the highest estimated quality and return its record_set
- base_op_cost_est = OperatorCostEstimates(cardinality=1.0, cost_per_record=0.0, time_per_record=0.0, quality=1.0)
- champion_record_set, champion_quality = None, None
- for record_set, op, _ in record_set_tuples:
- # skip failed operations
- if len(record_set) == 0:
- continue
-
- # get the estimated quality of this operator
- est_quality = op.naive_cost_estimates(base_op_cost_est).quality if self._is_llm_op(op) else 1.0
- if champion_quality is None or est_quality > champion_quality:
- champion_record_set, champion_quality = record_set, est_quality
-
- return champion_record_set, champion_quality
-
- def _flatten_record_sets(self, source_idx_to_record_sets: dict[int, list[DataRecordSet]]) -> tuple[list[DataRecord], list[RecordOpStats]]:
- """
- Flatten the list of record sets and record op stats for each source_idx.
- """
- all_records, all_record_op_stats = [], []
- for _, record_sets in source_idx_to_record_sets.items():
- for record_set in record_sets:
- all_records.extend(record_set.data_records)
- all_record_op_stats.extend(record_set.record_op_stats)
-
- return all_records, all_record_op_stats
-
- def _execute_op_set(self, op_input_pairs: list[tuple[PhysicalOperator, DataRecord | int]]) -> tuple[dict[int, list[tuple[DataRecordSet, PhysicalOperator, bool]]], dict[str, int]]:
- def execute_op_wrapper(operator, input) -> tuple[DataRecordSet, PhysicalOperator, DataRecord | int]:
- record_set = operator(input)
- return record_set, operator, input
-
- # TODO: modify unit tests to always have record_op_stats so we can use record_op_stats for source_idx
- # for scan operators, `input` will be the source_idx
- def get_source_idx(input):
- return input.source_idx if isinstance(input, DataRecord) else input
-
- def get_hash(operator, input):
+ return source_indices_to_record_sets, validation_gen_stats
+
+ def _execute_op_set(self, unique_logical_op_id: str, op_inputs: list[tuple[PhysicalOperator, str | tuple, int | DataRecord | list[DataRecord] | tuple[list[DataRecord]]]]) -> tuple[dict[int, list[tuple[DataRecordSet, PhysicalOperator, bool]]], dict[str, int]]:
+ def execute_op_wrapper(operator: PhysicalOperator, source_indices: str | tuple, input: int | DataRecord | list[DataRecord] | tuple[list[DataRecord]]) -> tuple[DataRecordSet, PhysicalOperator, list[DataRecord] | list[int]]:
+ # operator is a join
+ record_set = operator(input[0], input[1]) if isinstance(operator, JoinOp) else operator(input)
+ return record_set, operator, source_indices, input
+
+ def get_hash(operator: PhysicalOperator, input: int | DataRecord | list[DataRecord] | tuple[list[DataRecord]]):
+ if isinstance(input, list):
+ input = tuple(input)
+ elif isinstance(input, tuple):
+ input = (tuple(input[0]), tuple(input[1]))
  return hash(f"{operator.get_full_op_id()}{hash(input)}")

  # initialize mapping from source indices to output record sets
- source_idx_to_record_sets_and_ops = {get_source_idx(input): [] for _, input in op_input_pairs}
+ source_indices_to_record_sets_and_ops = {source_indices: [] for _, source_indices, _ in op_inputs}

  # if any operations were previously executed, read the results from the cache
- final_op_input_pairs = []
- for operator, input in op_input_pairs:
+ final_op_inputs = []
+ for operator, source_indices, input in op_inputs:
  # compute hash
  op_input_hash = get_hash(operator, input)

  # get result from cache
  if op_input_hash in self.cache:
- source_idx = get_source_idx(input)
  record_set, operator = self.cache[op_input_hash]
- source_idx_to_record_sets_and_ops[source_idx].append((record_set, operator, False))
+ source_indices_to_record_sets_and_ops[source_indices].append((record_set, operator, False))

- # otherwise, add to final_op_input_pairs
+ # otherwise, add to final_op_inputs
  else:
- final_op_input_pairs.append((operator, input))
+ final_op_inputs.append((operator, source_indices, input))

  # keep track of the number of llm operations
  num_llm_ops = 0
@@ -408,46 +306,41 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
  with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
  # create futures
  futures = [
- executor.submit(execute_op_wrapper, operator, input)
- for operator, input in final_op_input_pairs
+ executor.submit(execute_op_wrapper, operator, source_indices, input)
+ for operator, source_indices, input in final_op_inputs
  ]
- output_record_sets = []
- while len(futures) > 0:
- done_futures, not_done_futures = wait(futures, timeout=PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS)
- for future in done_futures:
- # update output record sets
- record_set, operator, input = future.result()
- output_record_sets.append((record_set, operator, input))
-
- # update cache
- op_input_hash = get_hash(operator, input)
- self.cache[op_input_hash] = (record_set, operator)

- # update progress manager
- if self._is_llm_op(operator):
- num_llm_ops += 1
- self.progress_manager.incr(operator.get_logical_op_id(), num_samples=1, total_cost=record_set.get_total_cost())
+ output_record_sets = []
+ for future in as_completed(futures):
+ # update output record sets
+ record_set, operator, source_indices, input = future.result()
+ output_record_sets.append((record_set, operator, source_indices, input))

- # update futures
- futures = list(not_done_futures)
+ # update cache
+ op_input_hash = get_hash(operator, input)
+ self.cache[op_input_hash] = (record_set, operator)

- # update mapping from source_idx to record sets and operators
- for record_set, operator, input in output_record_sets:
- # get the source_idx associated with this input record;
- source_idx = get_source_idx(input)
+ # update progress manager
+ if self._is_llm_op(operator):
+ num_llm_ops += 1
+ self.progress_manager.incr(unique_logical_op_id, num_samples=1, total_cost=record_set.get_total_cost())

- # add record_set to mapping from source_idx --> record_sets
- source_idx_to_record_sets_and_ops[source_idx].append((record_set, operator, True))
+ # update mapping from source_indices to record sets and operators
+ for record_set, operator, source_indices, input in output_record_sets:
+ # add record_set to mapping from source_indices --> record_sets
+ record_set.input = input
+ source_indices_to_record_sets_and_ops[source_indices].append((record_set, operator, True))

- return source_idx_to_record_sets_and_ops, num_llm_ops
+ return source_indices_to_record_sets_and_ops, num_llm_ops

  def _is_llm_op(self, physical_op: PhysicalOperator) -> bool:
  is_llm_convert = isinstance(physical_op, LLMConvert)
  is_llm_filter = isinstance(physical_op, LLMFilter)
  is_llm_retrieve = isinstance(physical_op, RetrieveOp) and isinstance(physical_op.index, Collection)
- return is_llm_convert or is_llm_filter or is_llm_retrieve
+ is_llm_join = isinstance(physical_op, JoinOp)
+ return is_llm_convert or is_llm_filter or is_llm_retrieve or is_llm_join

  @abstractmethod
- def execute_sentinel_plan(self, sentinel_plan: SentinelPlan, expected_outputs: dict[str, dict]):
+ def execute_sentinel_plan(self, sentinel_plan: SentinelPlan, train_dataset: dict[str, Dataset], validator: Validator) -> SentinelPlanStats:
  """Execute a SentinelPlan according to strategy"""
  pass
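A recurring pattern in the new _score_quality and _execute_op_set code is deduplicating repeated LLM work by hashing an operator id together with its input and reusing the first result. A stripped-down sketch of that memoization idea (the names and signature below are illustrative, not the package's API):

    # illustrative memoization keyed on operator id + input hash
    _cache: dict[int, object] = {}

    def cached_execute(operator_id: str, record, execute):
        # record must be hashable for this sketch to work
        key = hash(f"{operator_id}{hash(record)}")
        if key in _cache:
            return _cache[key]      # reuse the earlier result for identical work
        result = execute(record)    # e.g. an operator call or a validator scoring call
        _cache[key] = result
        return result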