palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ from palimpzest.core.elements.records import DataRecord, DataRecordSet
4
+ from palimpzest.core.models import OperatorCostEstimates, RecordOpStats
5
+ from palimpzest.query.operators.physical import PhysicalOperator
6
+
7
+
8
+ class DistinctOp(PhysicalOperator):
9
+ def __init__(self, distinct_cols: list[str], distinct_seen: set | None = None, *args, **kwargs):
10
+ super().__init__(*args, **kwargs)
11
+ self.distinct_cols = distinct_cols
12
+ self._distinct_seen = set() if distinct_seen is None else distinct_seen
13
+
14
+ def __str__(self):
15
+ op = super().__str__()
16
+ op += f" Distinct Cols: {self.distinct_cols}\n"
17
+ return op
18
+
19
+ def get_id_params(self):
20
+ id_params = super().get_id_params()
21
+ return {"distinct_cols": self.distinct_cols, **id_params}
22
+
23
+ def get_op_params(self):
24
+ op_params = super().get_op_params()
25
+ return {"distinct_cols": self.distinct_cols, "distinct_seen": self._distinct_seen, **op_params}
26
+
27
+ def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
28
+ # assume applying the distinct operator takes negligible additional time (and no cost in USD)
29
+ return OperatorCostEstimates(
30
+ cardinality=source_op_cost_estimates.cardinality,
31
+ time_per_record=0,
32
+ cost_per_record=0,
33
+ quality=1.0,
34
+ )
35
+
36
+ def __call__(self, candidate: DataRecord) -> DataRecordSet:
37
+ # create new DataRecord
38
+ dr = DataRecord.from_parent(schema=candidate.schema, parent_record=candidate)
39
+
40
+ # output record only if it has not been seen before
41
+ record_str = dr.to_json_str(project_cols=self.distinct_cols, bytes_to_str=True, sorted=True)
42
+ record_hash = f"{hash(record_str)}"
43
+ dr.passed_operator = record_hash not in self._distinct_seen
44
+ if dr.passed_operator:
45
+ self._distinct_seen.add(record_hash)
46
+
47
+ # create RecordOpStats object
48
+ record_op_stats = RecordOpStats(
49
+ record_id=dr.id,
50
+ record_parent_ids=dr.parent_ids,
51
+ record_source_indices=dr.source_indices,
52
+ record_state=dr.to_dict(include_bytes=False),
53
+ full_op_id=self.get_full_op_id(),
54
+ logical_op_id=self.logical_op_id,
55
+ op_name=self.op_name(),
56
+ time_per_record=0.0,
57
+ cost_per_record=0.0,
58
+ passed_operator=dr.passed_operator,
59
+ op_details={k: str(v) for k, v in self.get_id_params().items()},
60
+ )
61
+
62
+ return DataRecordSet([dr], [record_op_stats])
@@ -4,6 +4,8 @@ import time
4
4
  from abc import ABC, abstractmethod
5
5
  from typing import Any
6
6
 
7
+ from pydantic.fields import FieldInfo
8
+
7
9
  from palimpzest.constants import (
8
10
  MODEL_CARDS,
9
11
  NAIVE_EST_FILTER_SELECTIVITY,
@@ -12,20 +14,19 @@ from palimpzest.constants import (
12
14
  Model,
13
15
  PromptStrategy,
14
16
  )
15
- from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates, RecordOpStats
16
17
  from palimpzest.core.elements.filters import Filter
17
18
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
18
- from palimpzest.core.lib.fields import BooleanField
19
- from palimpzest.query.generators.generators import generator_factory
19
+ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, RecordOpStats
20
+ from palimpzest.query.generators.generators import Generator
20
21
  from palimpzest.query.operators.physical import PhysicalOperator
21
- from palimpzest.utils.model_helpers import get_vision_models
22
22
 
23
23
 
24
24
  class FilterOp(PhysicalOperator, ABC):
25
- def __init__(self, filter: Filter, *args, **kwargs):
25
+ def __init__(self, filter: Filter, desc: str | None = None, *args, **kwargs):
26
26
  super().__init__(*args, **kwargs)
27
- assert self.input_schema.get_desc() == self.output_schema.get_desc(), "Input and output schemas must match for FilterOp"
27
+ assert self.input_schema == self.output_schema, "Input and output schemas must match for FilterOp"
28
28
  self.filter_obj = filter
29
+ self.desc = desc
29
30
 
30
31
  def __str__(self):
31
32
  op = super().__str__()
@@ -34,11 +35,11 @@ class FilterOp(PhysicalOperator, ABC):
34
35
 
35
36
  def get_id_params(self):
36
37
  id_params = super().get_id_params()
37
- return {"filter": str(self.filter_obj), **id_params}
38
+ return {"filter": str(self.filter_obj), "desc": self.desc, **id_params}
38
39
 
39
40
  def get_op_params(self):
40
41
  op_params = super().get_op_params()
41
- return {"filter": self.filter_obj, **op_params}
42
+ return {"filter": self.filter_obj, "desc": self.desc, **op_params}
42
43
 
43
44
  @abstractmethod
44
45
  def is_image_filter(self) -> bool:
@@ -81,8 +82,8 @@ class FilterOp(PhysicalOperator, ABC):
81
82
  # create RecordOpStats object
82
83
  record_op_stats = RecordOpStats(
83
84
  record_id=dr.id,
84
- record_parent_id=dr.parent_id,
85
- record_source_idx=dr.source_idx,
85
+ record_parent_ids=dr.parent_ids,
86
+ record_source_indices=dr.source_indices,
86
87
  record_state=dr.to_dict(include_bytes=False),
87
88
  full_op_id=self.get_full_op_id(),
88
89
  logical_op_id=self.logical_op_id,
@@ -174,19 +175,22 @@ class LLMFilter(FilterOp):
174
175
  self,
175
176
  model: Model,
176
177
  prompt_strategy: PromptStrategy = PromptStrategy.COT_BOOL,
178
+ reasoning_effort: str | None = None,
177
179
  *args,
178
180
  **kwargs,
179
181
  ):
180
182
  super().__init__(*args, **kwargs)
181
183
  self.model = model
182
184
  self.prompt_strategy = prompt_strategy
183
- self.generator = generator_factory(model, prompt_strategy, Cardinality.ONE_TO_ONE, self.verbose)
185
+ self.reasoning_effort = reasoning_effort
186
+ self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
184
187
 
185
188
  def get_id_params(self):
186
189
  id_params = super().get_id_params()
187
190
  id_params = {
188
191
  "model": self.model.value,
189
192
  "prompt_strategy": self.prompt_strategy.value,
193
+ "reasoning_effort": self.reasoning_effort,
190
194
  **id_params,
191
195
  }
192
196
 
@@ -197,6 +201,7 @@ class LLMFilter(FilterOp):
197
201
  op_params = {
198
202
  "model": self.model,
199
203
  "prompt_strategy": self.prompt_strategy,
204
+ "reasoning_effort": self.reasoning_effort,
200
205
  **op_params,
201
206
  }
202
207
 
@@ -206,7 +211,7 @@ class LLMFilter(FilterOp):
206
211
  return self.model.value
207
212
 
208
213
  def is_image_filter(self) -> bool:
209
- return self.model in get_vision_models()
214
+ return self.prompt_strategy is PromptStrategy.COT_BOOL_IMAGE
210
215
 
211
216
  def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates):
212
217
  # estimate number of input tokens from source
@@ -225,8 +230,13 @@ class LLMFilter(FilterOp):
225
230
  )
226
231
 
227
232
  # get est. of conversion cost (in USD) per record from model card
233
+ usd_per_input_token = (
234
+ MODEL_CARDS[self.model.value]["usd_per_audio_input_token"]
235
+ if self.prompt_strategy.is_audio_prompt()
236
+ else MODEL_CARDS[self.model.value]["usd_per_input_token"]
237
+ )
228
238
  model_conversion_usd_per_record = (
229
- MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
239
+ usd_per_input_token * est_num_input_tokens
230
240
  + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
231
241
  )
232
242
 
@@ -235,7 +245,7 @@ class LLMFilter(FilterOp):
235
245
  cardinality = selectivity * source_op_cost_estimates.cardinality
236
246
 
237
247
  # estimate quality of output based on the strength of the model being used
238
- quality = (MODEL_CARDS[self.model.value]["overall"] / 100.0) * source_op_cost_estimates.quality
248
+ quality = (MODEL_CARDS[self.model.value]["overall"] / 100.0)
239
249
 
240
250
  return OperatorCostEstimates(
241
251
  cardinality=cardinality,
@@ -251,8 +261,8 @@ class LLMFilter(FilterOp):
251
261
  # construct kwargs for generation
252
262
  gen_kwargs = {"project_cols": input_fields, "filter_condition": self.filter_obj.filter_condition}
253
263
 
254
- # generate output; NOTE: BooleanField is used to indicate the output type; thus, the desc is not needed
255
- fields = {"passed_operator": BooleanField(desc="")}
264
+ # generate output; NOTE: FieldInfo is used to indicate the output type; thus, the desc is not needed
265
+ fields = {"passed_operator": FieldInfo(annotation=bool, description="Whether the record passed the filter operation")}
256
266
  field_answers, _, generation_stats, _ = self.generator(candidate, fields, **gen_kwargs)
257
267
 
258
268
  return field_answers, generation_stats
@@ -0,0 +1,403 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from abc import ABC, abstractmethod
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from pydantic.fields import FieldInfo
8
+
9
+ from palimpzest.constants import (
10
+ MODEL_CARDS,
11
+ NAIVE_EST_JOIN_SELECTIVITY,
12
+ NAIVE_EST_NUM_INPUT_TOKENS,
13
+ Cardinality,
14
+ Model,
15
+ PromptStrategy,
16
+ )
17
+ from palimpzest.core.elements.records import DataRecord, DataRecordSet
18
+ from palimpzest.core.models import OperatorCostEstimates, RecordOpStats
19
+ from palimpzest.query.generators.generators import Generator
20
+ from palimpzest.query.operators.physical import PhysicalOperator
21
+
22
+
23
class JoinOp(PhysicalOperator, ABC):
    """Abstract base for physical join operators.

    Concrete subclasses evaluate a natural-language join `condition` over pairs
    of records; input and output schemas must be identical.
    """

    def __init__(self, condition: str, desc: str | None = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.input_schema == self.output_schema, "Input and output schemas must match for JoinOp"
        self.condition = condition
        self.desc = desc

    def __str__(self):
        return super().__str__() + f" Condition: {self.condition}\n"

    def get_id_params(self):
        return {"condition": self.condition, "desc": self.desc, **super().get_id_params()}

    def get_op_params(self):
        return {"condition": self.condition, "desc": self.desc, **super().get_op_params()}

    @abstractmethod
    def is_image_join(self) -> bool:
        """Return True if the join operation processes image(s), False otherwise."""
        ...

    @abstractmethod
    def naive_cost_estimates(self, left_source_op_cost_estimates: OperatorCostEstimates, right_source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
        """Return naive cost estimates for joining the two given input streams."""
        ...
51
+
52
+
53
class BlockingNestedLoopsJoin(JoinOp):
    """Nested-loops join evaluating every (left, right) candidate pair with an LLM call.

    All pairs for the given input batches are fanned out to a thread pool and the
    call blocks until every pair has been evaluated.
    """

    def __init__(
        self,
        model: Model,
        prompt_strategy: PromptStrategy = PromptStrategy.COT_JOIN,
        join_parallelism: int = 64,
        reasoning_effort: str | None = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.model = model
        self.prompt_strategy = prompt_strategy
        self.join_parallelism = join_parallelism
        self.reasoning_effort = reasoning_effort
        self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
        # running count of completed pairwise comparisons (drives progress output)
        self.join_idx = 0

    def get_id_params(self):
        return {
            "model": self.model.value,
            "prompt_strategy": self.prompt_strategy.value,
            "join_parallelism": self.join_parallelism,
            "reasoning_effort": self.reasoning_effort,
            **super().get_id_params(),
        }

    def get_op_params(self):
        return {
            "model": self.model,
            "prompt_strategy": self.prompt_strategy,
            "join_parallelism": self.join_parallelism,
            "reasoning_effort": self.reasoning_effort,
            **super().get_op_params(),
        }

    def get_model_name(self):
        return self.model.value

    def is_image_join(self) -> bool:
        return self.prompt_strategy is PromptStrategy.COT_JOIN_IMAGE

    def naive_cost_estimates(self, left_source_op_cost_estimates: OperatorCostEstimates, right_source_op_cost_estimates: OperatorCostEstimates):
        # two records contribute input tokens to each join prompt
        est_num_input_tokens = 2 * NAIVE_EST_NUM_INPUT_TOKENS
        if self.is_image_join():
            est_num_input_tokens = 2 * 765 / 10  # 1024x1024 image is 765 tokens

        # NOTE: the output often generates an entire reasoning sentence, thus the true value may be higher;
        # the join's LLM call should only output TRUE or FALSE, so we expect ~1.25 output tokens
        est_num_output_tokens = 1.25

        # est. of time per record from the model card
        model_conversion_time_per_record = (
            MODEL_CARDS[self.model.value]["seconds_per_output_token"] * est_num_output_tokens
        )

        # est. of cost (in USD) per record from the model card; audio prompts are priced separately
        usd_per_input_token = (
            MODEL_CARDS[self.model.value]["usd_per_audio_input_token"]
            if self.prompt_strategy.is_audio_prompt()
            else MODEL_CARDS[self.model.value]["usd_per_input_token"]
        )
        model_conversion_usd_per_record = (
            usd_per_input_token * est_num_input_tokens
            + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
        )

        # output cardinality: constant join selectivity applied to the full cross product
        cardinality = NAIVE_EST_JOIN_SELECTIVITY * (
            left_source_op_cost_estimates.cardinality * right_source_op_cost_estimates.cardinality
        )

        # estimated quality scales with the strength of the model being used
        quality = MODEL_CARDS[self.model.value]["overall"] / 100.0

        return OperatorCostEstimates(
            cardinality=cardinality,
            time_per_record=model_conversion_time_per_record,
            cost_per_record=model_conversion_usd_per_record,
            quality=quality,
        )

    def _process_join_candidate_pair(
        self,
        left_candidate: DataRecord,
        right_candidate: DataRecord,
        gen_kwargs: dict,
    ) -> tuple[list[DataRecord], list[RecordOpStats]]:
        """Evaluate the join condition for one (left, right) pair; return the joined record and its stats."""
        t0 = time.time()

        # generate output; NOTE: FieldInfo is used to indicate the output type; thus, the desc is not needed
        fields = {"passed_operator": FieldInfo(annotation=bool, description="Whether the records satisfy the join condition")}
        field_answers, _, generation_stats, _ = self.generator(left_candidate, fields, right_candidate=right_candidate, **gen_kwargs)

        # did this pair satisfy the join condition?
        passed_operator = field_answers["passed_operator"]

        # build the joined output record
        join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
        join_dr.passed_operator = passed_operator

        # per-record execution statistics
        record_op_stats = RecordOpStats(
            record_id=join_dr.id,
            record_parent_ids=join_dr.parent_ids,
            record_source_indices=join_dr.source_indices,
            record_state=join_dr.to_dict(include_bytes=False),
            full_op_id=self.get_full_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=time.time() - t0,
            cost_per_record=generation_stats.cost_per_record,
            model_name=self.get_model_name(),
            join_condition=self.condition,
            total_input_tokens=generation_stats.total_input_tokens,
            total_output_tokens=generation_stats.total_output_tokens,
            total_input_cost=generation_stats.total_input_cost,
            total_output_cost=generation_stats.total_output_cost,
            llm_call_duration_secs=generation_stats.llm_call_duration_secs,
            fn_call_duration_secs=generation_stats.fn_call_duration_secs,
            total_llm_calls=generation_stats.total_llm_calls,
            total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
            answer=field_answers,
            passed_operator=passed_operator,
            image_operation=self.is_image_join(),
            op_details={k: str(v) for k, v in self.get_id_params().items()},
        )

        return [join_dr], [record_op_stats]

    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord]) -> tuple[DataRecordSet, int]:
        # fields from both sides of the join are projected into the prompt
        gen_kwargs = {"project_cols": self.get_input_fields(), "join_condition": self.condition}

        total_join_candidates = len(left_candidates) * len(right_candidates)
        output_records: list[DataRecord] = []
        output_record_op_stats: list[RecordOpStats] = []
        with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
            # fan out one task per pair in the cross product
            futures = [
                executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs)
                for left_candidate in left_candidates
                for right_candidate in right_candidates
            ]
            num_inputs_processed = len(futures)

            for future in as_completed(futures):
                self.join_idx += 1
                pair_records, pair_stats = future.result()
                output_records.extend(pair_records)
                output_record_op_stats.extend(pair_stats)
                print(f"{self.join_idx}/{total_join_candidates} JOINED")

        return DataRecordSet(output_records, output_record_op_stats), num_inputs_processed
215
+
216
+
217
class NestedLoopsJoin(JoinOp):
    """Incremental nested-loops join evaluating candidate pairs with an LLM call.

    Unlike the blocking variant, this operator remembers every input record it has
    seen so that newly arriving candidates on either side are joined against all
    previously stored records from the opposite side.
    """

    def __init__(
        self,
        model: Model,
        prompt_strategy: PromptStrategy = PromptStrategy.COT_JOIN,
        join_parallelism: int = 64,
        reasoning_effort: str | None = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.model = model
        self.prompt_strategy = prompt_strategy
        self.join_parallelism = join_parallelism
        self.reasoning_effort = reasoning_effort
        self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
        # running count of completed pairwise comparisons (drives progress output)
        self.join_idx = 0

        # all input records seen so far, kept to join against future arrivals
        self._left_input_records: list[DataRecord] = []
        self._right_input_records: list[DataRecord] = []

    def get_id_params(self):
        return {
            "model": self.model.value,
            "prompt_strategy": self.prompt_strategy.value,
            "join_parallelism": self.join_parallelism,
            "reasoning_effort": self.reasoning_effort,
            **super().get_id_params(),
        }

    def get_op_params(self):
        return {
            "model": self.model,
            "prompt_strategy": self.prompt_strategy,
            "join_parallelism": self.join_parallelism,
            "reasoning_effort": self.reasoning_effort,
            **super().get_op_params(),
        }

    def get_model_name(self):
        return self.model.value

    def is_image_join(self) -> bool:
        return self.prompt_strategy is PromptStrategy.COT_JOIN_IMAGE

    def naive_cost_estimates(self, left_source_op_cost_estimates: OperatorCostEstimates, right_source_op_cost_estimates: OperatorCostEstimates):
        # two records contribute input tokens to each join prompt
        est_num_input_tokens = 2 * NAIVE_EST_NUM_INPUT_TOKENS
        if self.is_image_join():
            est_num_input_tokens = 2 * 765 / 10  # 1024x1024 image is 765 tokens

        # NOTE: the output often generates an entire reasoning sentence, thus the true value may be higher;
        # the join's LLM call should only output TRUE or FALSE, so we expect ~1.25 output tokens
        est_num_output_tokens = 1.25

        # est. of time per record from the model card
        model_conversion_time_per_record = (
            MODEL_CARDS[self.model.value]["seconds_per_output_token"] * est_num_output_tokens
        )

        # est. of cost (in USD) per record from the model card; audio prompts are priced separately
        usd_per_input_token = (
            MODEL_CARDS[self.model.value]["usd_per_audio_input_token"]
            if self.prompt_strategy.is_audio_prompt()
            else MODEL_CARDS[self.model.value]["usd_per_input_token"]
        )
        model_conversion_usd_per_record = (
            usd_per_input_token * est_num_input_tokens
            + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
        )

        # output cardinality: constant join selectivity applied to the full cross product
        cardinality = NAIVE_EST_JOIN_SELECTIVITY * (
            left_source_op_cost_estimates.cardinality * right_source_op_cost_estimates.cardinality
        )

        # estimated quality scales with the strength of the model being used
        quality = MODEL_CARDS[self.model.value]["overall"] / 100.0

        return OperatorCostEstimates(
            cardinality=cardinality,
            time_per_record=model_conversion_time_per_record,
            cost_per_record=model_conversion_usd_per_record,
            quality=quality,
        )

    def _process_join_candidate_pair(
        self,
        left_candidate: DataRecord,
        right_candidate: DataRecord,
        gen_kwargs: dict,
    ) -> tuple[list[DataRecord], list[RecordOpStats]]:
        """Evaluate the join condition for one (left, right) pair; return the joined record and its stats."""
        t0 = time.time()

        # generate output; NOTE: FieldInfo is used to indicate the output type; thus, the desc is not needed
        fields = {"passed_operator": FieldInfo(annotation=bool, description="Whether the records satisfy the join condition")}
        field_answers, _, generation_stats, _ = self.generator(left_candidate, fields, right_candidate=right_candidate, **gen_kwargs)

        # did this pair satisfy the join condition?
        passed_operator = field_answers["passed_operator"]

        # build the joined output record
        join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
        join_dr.passed_operator = passed_operator

        # per-record execution statistics
        record_op_stats = RecordOpStats(
            record_id=join_dr.id,
            record_parent_ids=join_dr.parent_ids,
            record_source_indices=join_dr.source_indices,
            record_state=join_dr.to_dict(include_bytes=False),
            full_op_id=self.get_full_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=time.time() - t0,
            cost_per_record=generation_stats.cost_per_record,
            model_name=self.get_model_name(),
            join_condition=self.condition,
            total_input_tokens=generation_stats.total_input_tokens,
            total_output_tokens=generation_stats.total_output_tokens,
            total_input_cost=generation_stats.total_input_cost,
            total_output_cost=generation_stats.total_output_cost,
            llm_call_duration_secs=generation_stats.llm_call_duration_secs,
            fn_call_duration_secs=generation_stats.fn_call_duration_secs,
            total_llm_calls=generation_stats.total_llm_calls,
            total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
            answer=field_answers,
            passed_operator=passed_operator,
            image_operation=self.is_image_join(),
            op_details={k: str(v) for k, v in self.get_id_params().items()},
        )

        return [join_dr], [record_op_stats]

    def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord]) -> tuple[DataRecordSet | None, int]:
        # fields from both sides of the join are projected into the prompt
        gen_kwargs = {"project_cols": self.get_input_fields(), "join_condition": self.condition}

        # every new candidate must be paired with both the other side's new batch and
        # the records stored from previous calls
        pairings = [
            (left_candidate, right_candidate)
            for left_candidate in left_candidates
            for right_candidate in right_candidates
        ]
        pairings += [
            (left_candidate, right_candidate)
            for left_candidate in left_candidates
            for right_candidate in self._right_input_records
        ]
        pairings += [
            (left_candidate, right_candidate)
            for left_candidate in self._left_input_records
            for right_candidate in right_candidates
        ]
        num_inputs_processed = len(pairings)

        output_records: list[DataRecord] = []
        output_record_op_stats: list[RecordOpStats] = []
        with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
            futures = [
                executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs)
                for left_candidate, right_candidate in pairings
            ]

            # collect results as they complete
            for future in as_completed(futures):
                self.join_idx += 1
                pair_records, pair_stats = future.result()
                output_records.extend(pair_records)
                output_record_op_stats.extend(pair_stats)
                print(f"{self.join_idx} JOINED")

        # remember the new inputs so later arrivals can join against them
        self._left_input_records.extend(left_candidates)
        self._right_input_records.extend(right_candidates)

        # return None if no output records were produced
        if not output_records:
            return None, num_inputs_processed

        return DataRecordSet(output_records, output_record_op_stats), num_inputs_processed
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from palimpzest.core.data.dataclasses import OperatorCostEstimates, RecordOpStats
4
3
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
4
+ from palimpzest.core.models import OperatorCostEstimates, RecordOpStats
5
5
  from palimpzest.query.operators.physical import PhysicalOperator
6
6
 
7
7
 
@@ -41,8 +41,8 @@ class LimitScanOp(PhysicalOperator):
41
41
  # create RecordOpStats object
42
42
  record_op_stats = RecordOpStats(
43
43
  record_id=dr.id,
44
- record_parent_id=dr.parent_id,
45
- record_source_idx=dr.source_idx,
44
+ record_parent_ids=dr.parent_ids,
45
+ record_source_indices=dr.source_indices,
46
46
  record_state=dr.to_dict(include_bytes=False),
47
47
  full_op_id=self.get_full_op_id(),
48
48
  logical_op_id=self.logical_op_id,