palimpzest 1.0.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {palimpzest-1.0.0/src/palimpzest.egg-info → palimpzest-1.1.1}/PKG-INFO +1 -1
- {palimpzest-1.0.0 → palimpzest-1.1.1}/pyproject.toml +1 -1
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/elements/groupbysig.py +5 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/models.py +6 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/prompt_factory.py +15 -5
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/execution_strategy.py +7 -3
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/mab_execution_strategy.py +21 -7
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/generators/generators.py +1 -1
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/join.py +94 -63
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/rag.py +6 -5
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/topk.py +24 -5
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/processor/config.py +2 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/progress.py +32 -6
- {palimpzest-1.0.0 → palimpzest-1.1.1/src/palimpzest.egg-info}/PKG-INFO +1 -1
- {palimpzest-1.0.0 → palimpzest-1.1.1}/LICENSE +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/README.md +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/setup.cfg +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/agents/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/agents/compute_agents.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/agents/search_agents.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/constants.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/data/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/data/context.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/data/context_manager.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/data/dataset.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/data/index_dataset.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/data/iter_dataset.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/elements/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/elements/filters.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/elements/records.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/lib/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/lib/schemas.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/policy.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/agent_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/aggregate_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/context_search.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/convert_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/critique_and_refine_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/filter_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/join_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/moa_aggregator_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/moa_proposer_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/utils.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/validator.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/all_sample_execution_strategy.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/parallel_execution_strategy.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/generators/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/aggregate.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/compute.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/convert.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/critique_and_refine.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/distinct.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/filter.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/limit.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/logical.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/mixture_of_agents.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/physical.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/project.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/scan.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/search.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/split.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/cost_model.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/optimizer.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/plan.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/primitives.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/rules.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/optimizer/tasks.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/processor/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/processor/query_processor.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/processor/query_processor_factory.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/schemabuilder/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/tools/README.md +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/tools/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/tools/allenpdf.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/tools/pdfparser.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/tools/skema_tools.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/env_helpers.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/hash_helpers.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/model_helpers.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/udfs.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/validator/__init__.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/validator/validator.py +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest.egg-info/SOURCES.txt +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest.egg-info/dependency_links.txt +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest.egg-info/requires.txt +0 -0
- {palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest.egg-info/top_level.txt +0 -0
{palimpzest-1.0.0/src/palimpzest.egg-info → palimpzest-1.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.0.0
+Version: 1.1.1
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
{palimpzest-1.0.0 → palimpzest-1.1.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "1.0.0"
+version = "1.1.1"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.12"
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/elements/groupbysig.py
@@ -11,6 +11,11 @@ from palimpzest.core.lib.schemas import create_schema_from_fields
 # - construct the correct output schema using the input schema and the group by and aggregation fields
 # - remove/update all other references to GroupBySig in the codebase
 
+# TODO:
+# - move the arguments for group_by_fields, agg_funcs, and agg_fields into the Dataset.groupby() operator
+# - construct the correct output schema using the input schema and the group by and aggregation fields
+# - remove/update all other references to GroupBySig in the codebase
+
 # signature for a group by aggregate that applies
 # group and aggregation to an input tuple
 class GroupBySig:
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/core/models.py
@@ -454,6 +454,12 @@ class BasePlanStats(BaseModel):
         """
         return sum([gen_stats.total_output_tokens for _, gen_stats in self.validation_gen_stats.items()])
 
+    def get_total_cost_so_far(self) -> float:
+        """
+        Get the total cost incurred so far in this plan execution.
+        """
+        return self.sum_op_costs() + self.sum_validation_costs()
+
 
 class PlanStats(BasePlanStats):
     """
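The new BasePlanStats.get_total_cost_so_far() gives the MAB execution strategy (further down in this diff) a single number to compare against the new sample_cost_budget. A minimal sketch of the accounting it performs, with made-up dollar figures; sum_op_costs() and sum_validation_costs() are taken from the diff, everything else here is illustrative:

class PlanStatsSketch:
    def __init__(self):
        self.op_costs = {"op-0": 0.12, "op-1": 0.30}          # per-operator LLM spend ($)
        self.validation_costs = {"op-0": 0.05, "op-1": 0.02}  # validator LLM spend ($)

    def sum_op_costs(self) -> float:
        return sum(self.op_costs.values())

    def sum_validation_costs(self) -> float:
        return sum(self.validation_costs.values())

    def get_total_cost_so_far(self) -> float:
        # operator cost + validation cost, exactly as in the diff
        return self.sum_op_costs() + self.sum_validation_costs()

assert abs(PlanStatsSketch().get_total_cost_so_far() - 0.49) < 1e-9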
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/prompts/prompt_factory.py
@@ -830,7 +830,7 @@ class PromptFactory:
         field_type = dr.get_field_type(field_name)
 
         # audio filepath (or list of audio filepaths)
-        if field_type.annotation in [AudioFilepath, AudioFilepath | None, AudioFilepath | Any]:
+        if field_type.annotation in [AudioFilepath, AudioFilepath | None, AudioFilepath | Any] and field_value is not None:
             with open(field_value, "rb") as f:
                 base64_audio_str = base64.b64encode(f.read()).decode("utf-8")
             audio_content.append(
@@ -839,6 +839,8 @@ class PromptFactory:
 
         elif field_type.annotation in [list[AudioFilepath], list[AudioFilepath] | None, list[AudioFilepath] | Any]:
             for audio_filepath in field_value:
+                if audio_filepath is None:
+                    continue
                 with open(audio_filepath, "rb") as f:
                     base64_audio_str = base64.b64encode(f.read()).decode("utf-8")
                 audio_content.append(
@@ -846,13 +848,15 @@ class PromptFactory:
             )
 
         # pre-encoded images (or list of pre-encoded images)
-        elif field_type.annotation in [AudioBase64, AudioBase64 | None, AudioBase64 | Any]:
+        elif field_type.annotation in [AudioBase64, AudioBase64 | None, AudioBase64 | Any] and field_value is not None:
             audio_content.append(
                 {"type": "input_audio", "input_audio": {"data": field_value, "format": "wav"}}
             )
 
         elif field_type.annotation in [list[AudioBase64], list[AudioBase64] | None, list[AudioBase64] | Any]:
             for base64_audio in field_value:
+                if base64_audio is None:
+                    continue
                 audio_content.append(
                     {"type": "input_audio", "input_audio": {"data": base64_audio, "format": "wav"}}
                 )
@@ -882,7 +886,7 @@ class PromptFactory:
         field_type = dr.get_field_type(field_name)
 
         # image filepath (or list of image filepaths)
-        if field_type.annotation in [ImageFilepath, ImageFilepath | None, ImageFilepath | Any]:
+        if field_type.annotation in [ImageFilepath, ImageFilepath | None, ImageFilepath | Any] and field_value is not None:
             with open(field_value, "rb") as f:
                 base64_image_str = base64.b64encode(f.read()).decode("utf-8")
             image_content.append(
@@ -891,6 +895,8 @@ class PromptFactory:
 
         elif field_type.annotation in [list[ImageFilepath], list[ImageFilepath] | None, list[ImageFilepath] | Any]:
             for image_filepath in field_value:
+                if image_filepath is None:
+                    continue
                 with open(image_filepath, "rb") as f:
                     base64_image_str = base64.b64encode(f.read()).decode("utf-8")
                 image_content.append(
@@ -898,21 +904,25 @@ class PromptFactory:
             )
 
         # image url (or list of image urls)
-        elif field_type.annotation in [ImageURL, ImageURL | None, ImageURL | Any]:
+        elif field_type.annotation in [ImageURL, ImageURL | None, ImageURL | Any] and field_value is not None:
             image_content.append({"type": "image_url", "image_url": {"url": field_value}})
 
         elif field_type.annotation in [list[ImageURL], list[ImageURL] | None, list[ImageURL] | Any]:
             for image_url in field_value:
+                if image_url is None:
+                    continue
                 image_content.append({"type": "image_url", "image_url": {"url": image_url}})
 
         # pre-encoded images (or list of pre-encoded images)
-        elif field_type.annotation in [ImageBase64, ImageBase64 | None, ImageBase64 | Any]:
+        elif field_type.annotation in [ImageBase64, ImageBase64 | None, ImageBase64 | Any] and field_value is not None:
             image_content.append(
                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{field_value}"}}
             )
 
         elif field_type.annotation in [list[ImageBase64], list[ImageBase64] | None, list[ImageBase64] | Any]:
             for base64_image in field_value:
+                if base64_image is None:
+                    continue
                 image_content.append(
                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                 )
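Every multimodal branch in PromptFactory now guards against None field values, so a record with a missing audio or image field (or a None entry inside a list field) is skipped instead of crashing in open(None) or sending an empty payload to the model. A minimal sketch of the guarded pattern, using the message-content shape from the diff; the helper function itself is hypothetical:

import base64

def append_audio_content(field_value, audio_content):
    """Skip None values instead of passing them to open()."""
    if field_value is None:  # scalar field left unset on this record
        return
    values = field_value if isinstance(field_value, list) else [field_value]
    for audio_filepath in values:
        if audio_filepath is None:  # None entry inside a list field
            continue
        with open(audio_filepath, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        audio_content.append({"type": "input_audio", "input_audio": {"data": b64, "format": "wav"}})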
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/execution_strategy.py
@@ -82,14 +82,16 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
     """
     def __init__(
         self,
-        k: int,
-        j: int,
-        sample_budget: int,
         policy: Policy,
+        k: int = 6,
+        j: int = 4,
+        sample_budget: int = 100,
+        sample_cost_budget: float | None = None,
         priors: dict | None = None,
         use_final_op_quality: bool = False,
         seed: int = 42,
         exp_name: str | None = None,
+        dont_use_priors: bool = False,
         *args,
         **kwargs,
     ):
@@ -97,12 +99,14 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         self.k = k
         self.j = j
         self.sample_budget = sample_budget
+        self.sample_cost_budget = sample_cost_budget
         self.policy = policy
         self.priors = priors
         self.use_final_op_quality = use_final_op_quality
         self.seed = seed
         self.rng = np.random.default_rng(seed=seed)
         self.exp_name = exp_name
+        self.dont_use_priors = dont_use_priors
 
         # general cache which maps hash(logical_op_id, phys_op_id, hash(input)) --> record_set
         self.cache: dict[int, DataRecordSet] = {}
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/mab_execution_strategy.py
RENAMED
@@ -44,6 +44,7 @@ class OpFrontier:
         seed: int,
         policy: Policy,
         priors: dict | None = None,
+        dont_use_priors: bool = False,
     ):
         # set k and j, which are the initial number of operators in the frontier and the
         # initial number of records to sample for each frontier operator
@@ -51,6 +52,7 @@ class OpFrontier:
         self.j = j
         self.source_indices = source_indices
         self.root_dataset_ids = root_dataset_ids
+        self.dont_use_priors = dont_use_priors
 
         # store the policy that we are optimizing under
         self.policy = policy
@@ -68,6 +70,7 @@ class OpFrontier:
         is_llm_filter = isinstance(sample_op, LLMFilter)
         is_llm_topk = isinstance(sample_op, TopKOp) and isinstance(sample_op.index, Collection)
         self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_topk or self.is_llm_join
+        self.is_llm_convert = is_llm_convert
 
         # get order in which we will sample physical operators for this logical operator
         sample_op_indices = self._get_op_index_order(op_set, seed)
@@ -190,7 +193,9 @@ class OpFrontier:
         Returns a list of indices for the operators in the op_set.
         """
         # if this is not an llm-operator, we simply return the indices in random order
-        if not self.is_llm_op:
+        if not self.is_llm_op or self.dont_use_priors:
+            if self.is_llm_convert:
+                print("Using NO PRIORS for operator sampling order")
             rng = np.random.default_rng(seed=seed)
             op_indices = np.arange(len(op_set))
             rng.shuffle(op_indices)
@@ -198,6 +203,8 @@ class OpFrontier:
 
         # if this is an llm-operator, but we do not have priors, we first compute naive priors
         if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
+            if self.is_llm_convert:
+                print("Using NAIVE PRIORS for operator sampling order")
             self.priors = self._compute_naive_priors(op_set)
 
         # NOTE: self.priors is a dictionary with format:
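OpFrontier now threads through a dont_use_priors flag: when it is set (or the operator is not an LLM operator), _get_op_index_order falls back to a seeded random permutation instead of prior-based ranking. A free-standing sketch of that fallback branch; the prior-based path is elided, and the function name here is illustrative:

import numpy as np

def op_index_order(num_ops: int, seed: int, is_llm_op: bool, dont_use_priors: bool) -> list[int]:
    """Non-LLM operators *or* dont_use_priors=True get a seeded random order."""
    if not is_llm_op or dont_use_priors:
        rng = np.random.default_rng(seed=seed)
        op_indices = np.arange(num_ops)
        rng.shuffle(op_indices)
        return list(op_indices)
    # the real implementation ranks operators by (possibly naive) priors here
    raise NotImplementedError("prior-based ordering elided in this sketch")

print(op_index_order(5, seed=42, is_llm_op=True, dont_use_priors=True))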
@@ -680,6 +687,9 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
 
         return max_quality_op
 
+    def _compute_termination_condition(self, samples_drawn: int, sampling_cost: float) -> bool:
+        return (samples_drawn >= self.sample_budget) if self.sample_cost_budget is None else (sampling_cost >= self.sample_cost_budget)
+
     def _execute_sentinel_plan(
         self,
         plan: SentinelPlan,
@@ -688,8 +698,8 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         plan_stats: SentinelPlanStats,
     ) -> SentinelPlanStats:
         # sample records and operators and update the frontiers
-        samples_drawn = 0
-        while samples_drawn < self.sample_budget:
+        samples_drawn, sampling_cost = 0, 0.0
+        while not self._compute_termination_condition(samples_drawn, sampling_cost):
             # pre-compute the set of source indices which will need to be sampled
             source_indices_to_sample = set()
             for op_frontier in op_frontiers.values():
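_compute_termination_condition is the crux of the new cost-budgeted optimization: when sample_cost_budget is None the sampling loop stops after sample_budget record samples, otherwise it stops once the accumulated dollar cost (operator plus validation spend, via get_total_cost_so_far()) crosses the cost budget, and the sample budget is ignored. Restated as a free-standing function with a few worked checks:

def should_stop(samples_drawn: int, sampling_cost: float,
                sample_budget: int, sample_cost_budget: float | None) -> bool:
    """A cost budget, when given, takes precedence over the sample budget."""
    if sample_cost_budget is None:
        return samples_drawn >= sample_budget
    return sampling_cost >= sample_cost_budget

assert should_stop(100, 0.0, sample_budget=100, sample_cost_budget=None)       # sample budget hit
assert not should_stop(100, 0.50, sample_budget=100, sample_cost_budget=1.00)  # $0.50 < $1.00, keep going
assert should_stop(3, 1.25, sample_budget=100, sample_cost_budget=1.00)        # $1.25 >= $1.00, stop early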
@@ -732,6 +742,9 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         }
         source_indices_to_all_record_sets, val_gen_stats = self._score_quality(validator, source_indices_to_all_record_sets)
 
+        # update the progress manager with validation cost
+        self.progress_manager.incr_overall_progress_cost(val_gen_stats.cost_per_record)
+
         # remove records that were read from the execution cache before adding to record op stats
         new_record_op_stats = []
         for _, record_set_tuples in source_indices_to_record_set_tuples.items():
@@ -742,6 +755,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         # update plan stats
         plan_stats.add_record_op_stats(unique_logical_op_id, new_record_op_stats)
         plan_stats.add_validation_gen_stats(unique_logical_op_id, val_gen_stats)
+        sampling_cost = plan_stats.get_total_cost_so_far()
 
         # provide the best record sets as inputs to the next logical operator
         next_unique_logical_op_id = plan.get_next_unique_logical_op_id(unique_logical_op_id)
@@ -798,7 +812,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         assert len(root_dataset_ids) == 1, f"Scan for {sample_op} has {len(root_dataset_ids)} > 1 root dataset ids"
         root_dataset_id = root_dataset_ids[0]
         source_indices = dataset_id_to_shuffled_source_indices[root_dataset_id]
-        op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
+        op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors, self.dont_use_priors)
     elif isinstance(sample_op, JoinOp):
         assert len(source_unique_logical_op_ids) == 2, f"Join for {sample_op} has {len(source_unique_logical_op_ids)} != 2 source logical operators"
         left_source_indices = op_frontiers[source_unique_logical_op_ids[0]].source_indices
@@ -807,13 +821,13 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
         for left_source_idx in left_source_indices:
             for right_source_idx in right_source_indices:
                 source_indices.append((left_source_idx, right_source_idx))
-        op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
+        op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors, self.dont_use_priors)
     else:
         source_indices = op_frontiers[source_unique_logical_op_ids[0]].source_indices
-        op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
+        op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors, self.dont_use_priors)
 
     # initialize and start the progress manager
-    self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, progress=self.progress)
+    self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, sample_cost_budget=self.sample_cost_budget, progress=self.progress)
     self.progress_manager.start()
 
     # NOTE: we must handle progress manager outside of _execute_sentinel_plan to ensure that it is shut down correctly;
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/generators/generators.py
@@ -338,7 +338,7 @@ class Generator(Generic[ContextType, InputType]):
         reasoning_effort = "minimal" if self.reasoning_effort is None else self.reasoning_effort
         completion_kwargs = {"reasoning_effort": reasoning_effort, **completion_kwargs}
         if self.model.is_vllm_model():
-            completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key") **completion_kwargs}
+            completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key"), **completion_kwargs}
         completion = litellm.completion(model=self.model_name, messages=messages, **completion_kwargs)
         end_time = time.time()
         logger.debug(f"Generated completion in {end_time - start_time:.2f} seconds")
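This one-character change fixes a real 1.0.0 bug: without the comma, ** parses as the power operator rather than dict unpacking, so the expression tried to raise a string to a dict power and every vLLM completion failed before reaching litellm. A small repro of the failure mode, with illustrative values:

# Why the missing comma mattered: ** binds as exponentiation, not unpacking.
completion_kwargs = {"temperature": 0.0}
try:
    bad = {"api_base": "http://localhost:8000", "api_key": "fake-api-key" **completion_kwargs}
except TypeError as e:
    print(e)  # unsupported operand type(s) for ** or pow(): 'str' and 'dict'

# The 1.1.1 form merges completion_kwargs into the dict as intended:
good = {"api_base": "http://localhost:8000", "api_key": "fake-api-key", **completion_kwargs}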
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/join.py
@@ -27,6 +27,25 @@ from palimpzest.query.generators.generators import Generator
 from palimpzest.query.operators.physical import PhysicalOperator
 
 
+class Singleton:
+    def __new__(cls, *args, **kw):
+        if not hasattr(cls, '_instance'):
+            orig = super(Singleton, cls)  # noqa: UP008
+            cls._instance = orig.__new__(cls, *args, **kw)
+        return cls._instance
+
+class Locks(Singleton):
+    model = None
+    clip_lock = threading.Lock()
+    exec_lock = threading.Lock()
+
+    @classmethod
+    def get_model(cls, model_name: str):
+        with cls.clip_lock:
+            if cls.model is None:
+                cls.model = SentenceTransformer(model_name)
+        return cls.model
+
 def compute_similarity(left_embedding: list[float], right_embedding: list[float]) -> float:
     """
     Compute the similarity between two embeddings using cosine similarity.
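The new Locks singleton replaces EmbeddingJoin's per-instance _get_clip_model (removed below): the SentenceTransformer is now loaded once per process behind clip_lock and shared by every join operator, and exec_lock serializes the sampling phase across worker threads. A usage sketch under the diff's API; the commented lines assume sentence-transformers is installed and would download clip-ViT-B-32 on first use:

locks_a, locks_b = Locks(), Locks()
assert locks_a is locks_b  # Singleton.__new__ hands back the one shared instance

# model = locks_a.get_model("clip-ViT-B-32")   # loads once, then cached
# assert model is locks_b.get_model("clip-ViT-B-32")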
@@ -487,8 +506,7 @@ class EmbeddingJoin(LLMJoin):
             if field_name.split(".")[-1] in self.get_input_fields()
         ])
         self.embedding_model = Model.TEXT_EMBEDDING_3_SMALL if self.text_only else Model.CLIP_VIT_B_32
-        self.clip_model = None
-        self._lock = threading.Lock()
+        self.locks = Locks()
 
         # keep track of embedding costs that could not be amortized if no output records were produced
         self.residual_embedding_cost = 0.0
@@ -560,12 +578,6 @@ class EmbeddingJoin(LLMJoin):
             quality=quality,
         )
 
-    def _get_clip_model(self):
-        with self._lock:
-            if self.clip_model is None:
-                self.clip_model = SentenceTransformer(self.embedding_model.value)
-        return self.clip_model
-
     def _compute_embeddings(self, candidates: list[DataRecord], input_fields: list[str]) -> tuple[np.ndarray, GenerationStats]:
         # return empty array and empty stats if no candidates
         if len(candidates) == 0:
@@ -581,7 +593,7 @@ class EmbeddingJoin(LLMJoin):
             total_input_tokens = response.usage.total_tokens
             embeddings = np.array([item.embedding for item in response.data])
         else:
-            model = self._get_clip_model()
+            model = self.locks.get_model(self.embedding_model.value)
             embeddings = np.zeros((len(candidates), 512))  # CLIP embeddings are 512-dimensional
             num_input_fields_present = 0
             for field in input_fields:
@@ -623,7 +635,7 @@ class EmbeddingJoin(LLMJoin):
         output_record, output_record_op_stats = super()._process_join_candidate_pair(left_candidate, right_candidate, gen_kwargs)
         return output_record, output_record_op_stats, embedding_sim
 
-    def _process_join_candidate_with_sim(self, left_candidate: DataRecord, right_candidate: DataRecord, passed_operator: bool) -> tuple[DataRecord, RecordOpStats]:
+    def _process_join_candidate_with_sim(self, left_candidate: DataRecord, right_candidate: DataRecord, embedding_sim: float, passed_operator: bool) -> tuple[DataRecord, RecordOpStats]:
         # compute output record and add to output_records
         join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
         join_dr._passed_operator = passed_operator
@@ -656,7 +668,7 @@ class EmbeddingJoin(LLMJoin):
             op_details={k: str(v) for k, v in self.get_id_params().items()},
         )
 
-        return join_dr, record_op_stats
+        return join_dr, record_op_stats, embedding_sim
 
     def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord], final: bool = False) -> tuple[DataRecordSet, int]:
         # get the set of input fields from both records in the join
@@ -690,36 +702,50 @@ class EmbeddingJoin(LLMJoin):
         output_records, output_record_op_stats, num_inputs_processed = [], [], 0
 
         # draw samples until num_samples is reached
-        … (removed lines not preserved in this diff view)
+        with self.locks.exec_lock:
+            if self.samples_drawn < self.num_samples:
+                samples_to_draw = min(self.num_samples - self.samples_drawn, len(join_candidates))
+                join_candidate_samples = join_candidates[:samples_to_draw]
+                join_candidates = join_candidates[samples_to_draw:]
+
+                # apply the generator to each pair of candidates
+                with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
+                    futures = [
+                        executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim)
+                        for left_candidate, right_candidate, embedding_sim in join_candidate_samples
+                    ]
+
+                    # collect results as they complete
+                    similarities, joined = [], []
+                    for future in as_completed(futures):
+                        self.join_idx += 1
+                        join_output_record, join_output_record_op_stats, embedding_sim = future.result()
+                        output_records.append(join_output_record)
+                        output_record_op_stats.append(join_output_record_op_stats)
+                        similarities.append(embedding_sim)
+                        joined.append(join_output_record._passed_operator)
+                        print(f"{self.join_idx} JOINED")
+
+                # sort join results by embedding similarity
+                sorted_sim_join_tuples = sorted(zip(similarities, joined), key=lambda x: x[0])
+
+                # compute threshold below which no records joined
+                for embedding_sim, records_joined in sorted_sim_join_tuples:
+                    if records_joined:
+                        break
+                    if not records_joined and embedding_sim > self.max_non_matching_sim:
+                        self.max_non_matching_sim = embedding_sim
+
+                # compute threshold above which all records joined
+                for embedding_sim, records_joined in reversed(sorted_sim_join_tuples):
+                    if not records_joined:
+                        break
+                    if records_joined and embedding_sim < self.min_matching_sim:
+                        self.min_matching_sim = embedding_sim
+
+                # update samples drawn and num_inputs_processed
+                self.samples_drawn += samples_to_draw
+                num_inputs_processed += samples_to_draw
 
         # process remaining candidates based on embedding similarity
         if len(join_candidates) > 0:
@@ -727,43 +753,48 @@ class EmbeddingJoin(LLMJoin):
             with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
                 futures = []
                 for left_candidate, right_candidate, embedding_sim in join_candidates:
-                    … (removed lines not preserved in this diff view)
-                    if embedding_sim < self.min_matching_sim:
-                        output_records.append(output_record)
-                        output_record_op_stats.append(record_op_stats)
-                        print(f"{self.join_idx} SKIPPED (low sim: {embedding_sim:.4f} < {self.min_matching_sim:.4f})")
-
-                    elif embedding_sim > self.max_non_matching_sim:
-                        self.join_idx += 1
-                        output_record, record_op_stats = self._process_join_candidate_with_sim(left_candidate, right_candidate, passed_operator=True)
-                        output_records.append(output_record)
-                        output_record_op_stats.append(record_op_stats)
-                        print(f"{self.join_idx} JOINED (high sim: {embedding_sim:.4f} > {self.max_non_matching_sim:.4f})")
+                    # if the embedding similarity is lower than the threshold below which no records joined,
+                    # then we can skip the LLM call and mark the records as not joined
+                    if embedding_sim < self.max_non_matching_sim:
+                        futures.append(executor.submit(self._process_join_candidate_with_sim, left_candidate, right_candidate, embedding_sim, passed_operator=False))
+
+                    # if the embedding similarity is higher than the threshold above which all records joined,
+                    # then we can skip the LLM call and mark the records as joined
+                    elif embedding_sim > self.min_matching_sim:
+                        futures.append(executor.submit(self._process_join_candidate_with_sim, left_candidate, right_candidate, embedding_sim, passed_operator=True))
+
+                    # otherwise, we will process the LLM call
+                    else:
+                        futures.append(executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim))
 
                     num_inputs_processed += 1
 
                 # collect results as they complete
+                similarities, joined = [], []
                 for future in as_completed(futures):
                     self.join_idx += 1
                     join_output_record, join_output_record_op_stats, embedding_sim = future.result()
                     output_records.append(join_output_record)
                     output_record_op_stats.append(join_output_record_op_stats)
+                    similarities.append(embedding_sim)
+                    joined.append(join_output_record._passed_operator)
                     print(f"{self.join_idx} JOINED")
 
+                ### update thresholds if there are llm calls which incrementally squeeze the boundaries ###
+                # sort join results by embedding similarity
+                sorted_sim_join_tuples = sorted(zip(similarities, joined), key=lambda x: x[0])
+
+                # potentially update threshold below which no records joined
+                for embedding_sim, records_joined in sorted_sim_join_tuples:
+                    if records_joined:
+                        break
                     if not records_joined and embedding_sim > self.max_non_matching_sim:
                         self.max_non_matching_sim = embedding_sim
+
+                # potentially update threshold above which all records joined
+                for embedding_sim, records_joined in reversed(sorted_sim_join_tuples):
+                    if not records_joined:
+                        break
                     if records_joined and embedding_sim < self.min_matching_sim:
                         self.min_matching_sim = embedding_sim
 
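Taken together, the two hunks above give EmbeddingJoin an adaptive shortcut: sampled LLM join decisions are sorted by embedding similarity, the highest similarity in the all-non-matching prefix becomes max_non_matching_sim, the lowest similarity in the all-matching suffix becomes min_matching_sim, and later pairs falling outside the ambiguous band between the two skip the LLM entirely. A toy walk-through of the threshold bookkeeping with invented similarities; the real code starts from previously learned thresholds rather than ±inf:

# (similarity, did_the_LLM_say_joined), already sorted ascending by similarity
sorted_sim_join_tuples = [(0.21, False), (0.34, False), (0.58, True),
                          (0.61, False), (0.70, True), (0.83, True)]

max_non_matching_sim = float("-inf")
for sim, joined in sorted_sim_join_tuples:            # walk up from the bottom
    if joined:
        break
    if sim > max_non_matching_sim:
        max_non_matching_sim = sim                    # -> 0.34

min_matching_sim = float("inf")
for sim, joined in reversed(sorted_sim_join_tuples):  # walk down from the top
    if not joined:
        break
    if sim < min_matching_sim:
        min_matching_sim = sim                        # -> 0.70

# future pairs with sim < 0.34 are marked "not joined" without an LLM call,
# pairs with sim > 0.70 are marked "joined"; only [0.34, 0.70] needs the LLM
print(max_non_matching_sim, min_matching_sim)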
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/rag.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import time
+from typing import Any
 
 from numpy import dot
 from numpy.linalg import norm
@@ -153,9 +154,9 @@ class RAGConvert(LLMConvert):
         field = candidate.get_field_type(field_name)
 
         # skip this field if it is not a string or a list of strings
-        is_string_field = field.annotation in [str, str | None]
-        is_list_string_field = field.annotation in [list[str], list[str] | None]
-        if not (is_string_field or is_list_string_field):
+        is_string_field = field.annotation in [str, str | None, str | Any]
+        is_list_string_field = field.annotation in [list[str], list[str] | None, list[str] | Any]
+        if not (is_string_field or is_list_string_field) or candidate[field_name] is None:
             continue
 
         # if this is a list of strings, join the strings
@@ -358,8 +359,8 @@ class RAGFilter(LLMFilter):
         field = candidate.get_field_type(field_name)
 
         # skip this field if it is not a string or a list of strings
-        is_string_field = field.annotation in [str, str | None]
-        is_list_string_field = field.annotation in [list[str], list[str] | None]
+        is_string_field = field.annotation in [str, str | None, str | Any]
+        is_list_string_field = field.annotation in [list[str], list[str] | None, list[str] | Any]
         if not (is_string_field or is_list_string_field):
             continue
 
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/topk.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import threading
 import time
 from typing import Callable
 
@@ -17,6 +18,24 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, RecordOpStats
 from palimpzest.query.operators.physical import PhysicalOperator
 
 
+class Singleton:
+    def __new__(cls, *args, **kw):
+        if not hasattr(cls, '_instance'):
+            orig = super(Singleton, cls)  # noqa: UP008
+            cls._instance = orig.__new__(cls, *args, **kw)
+        return cls._instance
+
+class ClipModel(Singleton):
+    model = None
+    lock = threading.Lock()
+
+    @classmethod
+    def get_model(cls, model_name: str):
+        with cls.lock:
+            if cls.model is None:
+                cls.model = SentenceTransformer(model_name)
+        return cls.model
+
 class TopKOp(PhysicalOperator):
     def __init__(
         self,
@@ -56,6 +75,7 @@ class TopKOp(PhysicalOperator):
         self.output_attrs = output_attrs
         self.search_func = search_func if search_func is not None else self.default_search_func
         self.k = k
+        self.clip_model = ClipModel()
 
     def __str__(self):
         op = super().__str__()
@@ -185,7 +205,6 @@ class TopKOp(PhysicalOperator):
         # construct and return the record set
         return DataRecordSet(drs, record_op_stats_lst)
 
-
     def __call__(self, candidate: DataRecord) -> DataRecordSet:
         start_time = time.time()
 
@@ -209,9 +228,9 @@ class TopKOp(PhysicalOperator):
         inputs, gen_stats = None, GenerationStats()
         if isinstance(self.index, Collection):
             uses_openai_embedding_fcn = isinstance(self.index._embedding_function, OpenAIEmbeddingFunction)
-            … (removed line not preserved in this diff view)
+            uses_clip_model = isinstance(self.index._embedding_function, SentenceTransformerEmbeddingFunction)
             error_msg = "ChromaDB index must use OpenAI or SentenceTransformer embedding function; see: https://docs.trychroma.com/integrations/embedding-models/openai"
-            assert uses_openai_embedding_fcn or …
+            assert uses_openai_embedding_fcn or uses_clip_model, error_msg
 
             model_name = self.index._embedding_function.model_name if uses_openai_embedding_fcn else "clip-ViT-B-32"
             err_msg = f"For Chromadb, we currently only support `text-embedding-3-small` and `clip-ViT-B-32`; your index uses: {model_name}"
@@ -228,8 +247,8 @@ class TopKOp(PhysicalOperator):
                 total_input_tokens = response.usage.total_tokens
                 inputs = [item.embedding for item in response.data]
 
-            elif …
-                model = …
+            elif uses_clip_model:
+                model = self.clip_model.get_model(model_name)
                 inputs = model.encode(query)
 
             embed_total_time = time.time() - embed_start_time
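topk.py gets the same process-wide singleton treatment as join.py, so concurrent TopKOp instances share one CLIP encoder instead of each loading its own copy. A brief usage sketch under the diff's API:

clip = ClipModel()
assert clip is ClipModel()  # Singleton reuses the one shared instance
# model = clip.get_model("clip-ViT-B-32")  # loaded on first call, cached after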
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/query/processor/config.py
@@ -44,9 +44,11 @@ class QueryProcessorConfig(BaseModel):
     k: int = Field(default=6)
     j: int = Field(default=4)
     sample_budget: int = Field(default=100)
+    sample_cost_budget: float | None = Field(default=None)
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)
     priors: dict | None = Field(default=None)
+    dont_use_priors: bool = Field(default=False)
 
     def to_dict(self) -> dict:
         """Convert the config to a dict representation."""
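These two fields are the user-facing switches for the behavior above. A hedged usage sketch: the two new field names come from this diff, while how the config is ultimately passed into a query run depends on your palimpzest entry point:

from palimpzest.query.processor.config import QueryProcessorConfig

# Cap sentinel sampling at $2.00 of LLM spend instead of 100 record samples,
# and disable prior-based operator ordering during optimization.
config = QueryProcessorConfig(
    sample_cost_budget=2.00,  # new in 1.1.1: dollar budget for optimization
    dont_use_priors=True,     # new in 1.1.1: seeded random operator sampling order
)
print(config.to_dict())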
{palimpzest-1.0.0 → palimpzest-1.1.1}/src/palimpzest/utils/progress.py
@@ -283,7 +283,7 @@ class PZProgressManager(ProgressManager):
         self.unique_full_op_id_to_stats[unique_full_op_id].memory_usage_mb = get_memory_usage()
 
 class PZSentinelProgressManager(ProgressManager):
-    def __init__(self, plan: SentinelPlan, sample_budget: int):
+    def __init__(self, plan: SentinelPlan, sample_budget: int | None, sample_cost_budget: float | None):
         # overall progress bar
         self.overall_progress = RichProgress(
             SpinnerColumn(),
@@ -298,7 +298,9 @@ class PZSentinelProgressManager(ProgressManager):
             refresh_per_second=10,
             expand=True,  # Use full width
         )
-        self.overall_task_id = self.overall_progress.add_task("", total=sample_budget, cost=0.0, recent="")
+        self.use_cost_budget = sample_cost_budget is not None
+        total = sample_cost_budget if self.use_cost_budget else sample_budget
+        self.overall_task_id = self.overall_progress.add_task("", total=total, cost=0.0, recent="")
 
         # logical operator progress bars
         self.op_progress = RichProgress(
@@ -334,6 +336,9 @@ class PZSentinelProgressManager(ProgressManager):
         # initialize start time
         self.start_time = None
 
+        # initialize validation cost
+        self.validation_cost = 0.0
+
         # add a task to the progress manager for each operator in the plan
         for topo_idx, (logical_op_id, op_set) in enumerate(plan):
             unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
@@ -387,15 +392,34 @@ class PZSentinelProgressManager(ProgressManager):
         # start progress bars
         self.live_display.start()
 
+    def incr_overall_progress_cost(self, cost_delta: float):
+        """Advance the overall progress bar by the given cost delta"""
+        self.validation_cost += cost_delta
+        self.overall_progress.update(
+            self.overall_task_id,
+            advance=cost_delta,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
+            refresh=True,
+        )
+
+        # force the live display to refresh
+        self.live_display.refresh()
+
     def incr(self, unique_logical_op_id: str, num_samples: int, display_text: str | None = None, **kwargs):
         # TODO: (above) organize progress bars into a Live / Table / Panel or something
         # get the task for the given operation
         task = self.unique_logical_op_id_to_task.get(unique_logical_op_id)
 
+        # store the cost before updating stats
+        previous_total_cost = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost
+
         # update statistics with any additional keyword arguments
         if kwargs != {}:
             self.update_stats(unique_logical_op_id, **kwargs)
 
+        # compute the cost delta
+        cost_delta = self.unique_logical_op_id_to_stats[unique_logical_op_id].total_cost - previous_total_cost
+
         # update progress bar and recent text in one update
         if display_text is not None:
             self.unique_logical_op_id_to_stats[unique_logical_op_id].recent_text = display_text
@@ -414,10 +438,11 @@ class PZSentinelProgressManager(ProgressManager):
         )
 
         # advance the overall progress bar
+        advance = cost_delta if self.use_cost_budget else num_samples
        self.overall_progress.update(
             self.overall_task_id,
-            advance=num_samples,
-            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()),
+            advance=advance,
+            cost=sum(stats.total_cost for _, stats in self.unique_logical_op_id_to_stats.items()) + self.validation_cost,
             refresh=True,
         )
 
@@ -451,6 +476,7 @@ def create_progress_manager(
     plan: PhysicalPlan | SentinelPlan,
     num_samples: int | None = None,
     sample_budget: int | None = None,
+    sample_cost_budget: float | None = None,
     progress: bool = True,
 ) -> ProgressManager:
     """Factory function to create appropriate progress manager based on environment"""
@@ -458,7 +484,7 @@ def create_progress_manager(
         return MockProgressManager(plan, num_samples)
 
     if isinstance(plan, SentinelPlan):
-        assert sample_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
-        return PZSentinelProgressManager(plan, sample_budget)
+        assert sample_budget is not None or sample_cost_budget is not None, "Sample budget must be specified for SentinelPlan progress manager"
+        return PZSentinelProgressManager(plan, sample_budget, sample_cost_budget)
 
     return PZProgressManager(plan, num_samples)
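With a cost budget active, the sentinel progress bar's total is denominated in dollars: incr() advances by the cost delta of the operator's stats rather than by record samples, and validation spend (tracked via incr_overall_progress_cost) is folded into the displayed total. The advance rule, restated with a couple of checks:

def advance_amount(use_cost_budget: bool, cost_delta: float, num_samples: int) -> float:
    """Dollars when a cost budget is active, record samples otherwise."""
    return cost_delta if use_cost_budget else num_samples

assert advance_amount(True, cost_delta=0.07, num_samples=5) == 0.07
assert advance_amount(False, cost_delta=0.07, num_samples=5) == 5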
{palimpzest-1.0.0 → palimpzest-1.1.1/src/palimpzest.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.0.0
+Version: 1.1.1
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
All remaining files are unchanged between 1.0.0 and 1.1.1 (see the +0 -0 entries in the file list above).