PyPI - palimpzest - Versions diffs - 1.1.0__tar.gz → 1.1.1__tar.gz - Mend

palimpzest 1.1.0.tar.gz → 1.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

{palimpzest-1.1.0/src/palimpzest.egg-info → palimpzest-1.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.1.0
+Version: 1.1.1
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org

{palimpzest-1.1.0 → palimpzest-1.1.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "1.1.0"
+version = "1.1.1"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.12"

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/prompts/prompt_factory.py RENAMED Viewed

@@ -830,7 +830,7 @@ class PromptFactory:
                 field_type = dr.get_field_type(field_name)
                 # audio filepath (or list of audio filepaths)
-                if field_type.annotation in [AudioFilepath, AudioFilepath | None, AudioFilepath | Any]:
+                if field_type.annotation in [AudioFilepath, AudioFilepath | None, AudioFilepath | Any] and field_value is not None:
                     with open(field_value, "rb") as f:
                         base64_audio_str = base64.b64encode(f.read()).decode("utf-8")
                     audio_content.append(
@@ -839,6 +839,8 @@ class PromptFactory:
                 elif field_type.annotation in [list[AudioFilepath], list[AudioFilepath] | None, list[AudioFilepath] | Any]:
                     for audio_filepath in field_value:
+                        if audio_filepath is None:
+                            continue
                         with open(audio_filepath, "rb") as f:
                             base64_audio_str = base64.b64encode(f.read()).decode("utf-8")
                         audio_content.append(
@@ -846,13 +848,15 @@ class PromptFactory:
                         )
                 # pre-encoded images (or list of pre-encoded images)
-                elif field_type.annotation in [AudioBase64, AudioBase64 | None, AudioBase64 | Any]:
+                elif field_type.annotation in [AudioBase64, AudioBase64 | None, AudioBase64 | Any] and field_value is not None:
                     audio_content.append(
                         {"type": "input_audio", "input_audio": {"data": field_value, "format": "wav"}}
                     )
                 elif field_type.annotation in [list[AudioBase64], list[AudioBase64] | None, list[AudioBase64] | Any]:
                     for base64_audio in field_value:
+                        if base64_audio is None:
+                            continue
                         audio_content.append(
                             {"type": "input_audio", "input_audio": {"data": base64_audio, "format": "wav"}}
                         )
@@ -882,7 +886,7 @@ class PromptFactory:
                 field_type = dr.get_field_type(field_name)
                 # image filepath (or list of image filepaths)
-                if field_type.annotation in [ImageFilepath, ImageFilepath | None, ImageFilepath | Any]:
+                if field_type.annotation in [ImageFilepath, ImageFilepath | None, ImageFilepath | Any] and field_value is not None:
                     with open(field_value, "rb") as f:
                         base64_image_str = base64.b64encode(f.read()).decode("utf-8")
                     image_content.append(
@@ -891,6 +895,8 @@ class PromptFactory:
                 elif field_type.annotation in [list[ImageFilepath], list[ImageFilepath] | None, list[ImageFilepath] | Any]:
                     for image_filepath in field_value:
+                        if image_filepath is None:
+                            continue
                         with open(image_filepath, "rb") as f:
                             base64_image_str = base64.b64encode(f.read()).decode("utf-8")
                         image_content.append(
@@ -898,21 +904,25 @@ class PromptFactory:
                         )
                 # image url (or list of image urls)
-                elif field_type.annotation in [ImageURL, ImageURL | None, ImageURL | Any]:
+                elif field_type.annotation in [ImageURL, ImageURL | None, ImageURL | Any] and field_value is not None:
                     image_content.append({"type": "image_url", "image_url": {"url": field_value}})
                 elif field_type.annotation in [list[ImageURL], list[ImageURL] | None, list[ImageURL] | Any]:
                     for image_url in field_value:
+                        if image_url is None:
+                            continue
                         image_content.append({"type": "image_url", "image_url": {"url": image_url}})
                 # pre-encoded images (or list of pre-encoded images)
-                elif field_type.annotation in [ImageBase64, ImageBase64 | None, ImageBase64 | Any]:
+                elif field_type.annotation in [ImageBase64, ImageBase64 | None, ImageBase64 | Any] and field_value is not None:
                     image_content.append(
                         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{field_value}"}}
                     )
                 elif field_type.annotation in [list[ImageBase64], list[ImageBase64] | None, list[ImageBase64] | Any]:
                     for base64_image in field_value:
+                        if base64_image is None:
+                            continue
                         image_content.append(
                             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                         )

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/execution_strategy.py RENAMED Viewed

@@ -91,6 +91,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         use_final_op_quality: bool = False,
         seed: int = 42,
         exp_name: str | None = None,
+        dont_use_priors: bool = False,
         *args,
         **kwargs,
     ):
@@ -105,6 +106,7 @@ class SentinelExecutionStrategy(BaseExecutionStrategy, ABC):
         self.seed = seed
         self.rng = np.random.default_rng(seed=seed)
         self.exp_name = exp_name
+        self.dont_use_priors = dont_use_priors
         # general cache which maps hash(logical_op_id, phys_op_id, hash(input)) --> record_set
         self.cache: dict[int, DataRecordSet] = {}

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/query/execution/mab_execution_strategy.py RENAMED Viewed

@@ -44,6 +44,7 @@ class OpFrontier:
             seed: int,
             policy: Policy,
             priors: dict | None = None,
+            dont_use_priors: bool = False,
         ):
         # set k and j, which are the initial number of operators in the frontier and the
         # initial number of records to sample for each frontier operator
@@ -51,6 +52,7 @@ class OpFrontier:
         self.j = j
         self.source_indices = source_indices
         self.root_dataset_ids = root_dataset_ids
+        self.dont_use_priors = dont_use_priors
         # store the policy that we are optimizing under
         self.policy = policy
@@ -68,6 +70,7 @@ class OpFrontier:
         is_llm_filter = isinstance(sample_op, LLMFilter)
         is_llm_topk = isinstance(sample_op, TopKOp) and isinstance(sample_op.index, Collection)
         self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_topk or self.is_llm_join
+        self.is_llm_convert = is_llm_convert
         # get order in which we will sample physical operators for this logical operator
         sample_op_indices = self._get_op_index_order(op_set, seed)
@@ -190,7 +193,9 @@ class OpFrontier:
         Returns a list of indices for the operators in the op_set.
         """
         # if this is not an llm-operator, we simply return the indices in random order
-        if not self.is_llm_op:
+        if not self.is_llm_op or self.dont_use_priors:
+            if self.is_llm_convert:
+                print("Using NO PRIORS for operator sampling order")
             rng = np.random.default_rng(seed=seed)
             op_indices = np.arange(len(op_set))
             rng.shuffle(op_indices)
@@ -198,6 +203,8 @@ class OpFrontier:
         # if this is an llm-operator, but we do not have priors, we first compute naive priors
         if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
+            if self.is_llm_convert:
+                print("Using NAIVE PRIORS for operator sampling order")
             self.priors = self._compute_naive_priors(op_set)
         # NOTE: self.priors is a dictionary with format:
@@ -805,7 +812,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 assert len(root_dataset_ids) == 1, f"Scan for {sample_op} has {len(root_dataset_ids)} > 1 root dataset ids"
                 root_dataset_id = root_dataset_ids[0]
                 source_indices = dataset_id_to_shuffled_source_indices[root_dataset_id]
-                op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
+                op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors, self.dont_use_priors)
             elif isinstance(sample_op, JoinOp):
                 assert len(source_unique_logical_op_ids) == 2, f"Join for {sample_op} has {len(source_unique_logical_op_ids)} != 2 source logical operators"
                 left_source_indices = op_frontiers[source_unique_logical_op_ids[0]].source_indices
@@ -814,10 +821,10 @@ class MABExecutionStrategy(SentinelExecutionStrategy):
                 for left_source_idx in left_source_indices:
                     for right_source_idx in right_source_indices:
                         source_indices.append((left_source_idx, right_source_idx))
-                op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
+                op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors, self.dont_use_priors)
             else:
                 source_indices = op_frontiers[source_unique_logical_op_ids[0]].source_indices
-                op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors)
+                op_frontiers[unique_logical_op_id] = OpFrontier(op_set, source_unique_logical_op_ids, root_dataset_ids, source_indices, self.k, self.j, self.seed, self.policy, self.priors, self.dont_use_priors)
         # initialize and start the progress manager
         self.progress_manager = create_progress_manager(plan, sample_budget=self.sample_budget, sample_cost_budget=self.sample_cost_budget, progress=self.progress)

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/join.py RENAMED Viewed

@@ -27,6 +27,25 @@ from palimpzest.query.generators.generators import Generator
 from palimpzest.query.operators.physical import PhysicalOperator
+class Singleton:
+     def __new__(cls, *args, **kw):
+         if not hasattr(cls, '_instance'):
+             orig = super(Singleton, cls)  # noqa: UP008
+             cls._instance = orig.__new__(cls, *args, **kw)
+         return cls._instance
+class Locks(Singleton):
+    model = None
+    clip_lock = threading.Lock()
+    exec_lock = threading.Lock()
+    @classmethod
+    def get_model(cls, model_name: str):
+        with cls.clip_lock:
+            if cls.model is None:
+                cls.model = SentenceTransformer(model_name)
+            return cls.model
 def compute_similarity(left_embedding: list[float], right_embedding: list[float]) -> float:
     """
     Compute the similarity between two embeddings using cosine similarity.
@@ -487,8 +506,7 @@ class EmbeddingJoin(LLMJoin):
             if field_name.split(".")[-1] in self.get_input_fields()
         ])
         self.embedding_model = Model.TEXT_EMBEDDING_3_SMALL if self.text_only else Model.CLIP_VIT_B_32
-        self.clip_model = None
-        self._lock = threading.Lock()
+        self.locks = Locks()
         # keep track of embedding costs that could not be amortized if no output records were produced
         self.residual_embedding_cost = 0.0
@@ -560,12 +578,6 @@ class EmbeddingJoin(LLMJoin):
             quality=quality,
         )
-    def _get_clip_model(self):
-        with self._lock:
-            if self.clip_model is None:
-                self.clip_model = SentenceTransformer(self.embedding_model.value)
-            return self.clip_model
     def _compute_embeddings(self, candidates: list[DataRecord], input_fields: list[str]) -> tuple[np.ndarray, GenerationStats]:
         # return empty array and empty stats if no candidates
         if len(candidates) == 0:
@@ -581,7 +593,7 @@ class EmbeddingJoin(LLMJoin):
             total_input_tokens = response.usage.total_tokens
             embeddings = np.array([item.embedding for item in response.data])
         else:
-            model = self._get_clip_model()
+            model = self.locks.get_model(self.embedding_model.value)
             embeddings = np.zeros((len(candidates), 512))  # CLIP embeddings are 512-dimensional
             num_input_fields_present = 0
             for field in input_fields:
@@ -623,7 +635,7 @@ class EmbeddingJoin(LLMJoin):
         output_record, output_record_op_stats = super()._process_join_candidate_pair(left_candidate, right_candidate, gen_kwargs)
         return output_record, output_record_op_stats, embedding_sim
-    def _process_join_candidate_with_sim(self, left_candidate: DataRecord, right_candidate: DataRecord, passed_operator: bool) -> tuple[DataRecord, RecordOpStats]:
+    def _process_join_candidate_with_sim(self, left_candidate: DataRecord, right_candidate: DataRecord, embedding_sim: float, passed_operator: bool) -> tuple[DataRecord, RecordOpStats]:
         # compute output record and add to output_records
         join_dr = DataRecord.from_join_parents(self.output_schema, left_candidate, right_candidate)
         join_dr._passed_operator = passed_operator
@@ -656,7 +668,7 @@ class EmbeddingJoin(LLMJoin):
             op_details={k: str(v) for k, v in self.get_id_params().items()},
         )
-        return join_dr, record_op_stats
+        return join_dr, record_op_stats, embedding_sim
     def __call__(self, left_candidates: list[DataRecord], right_candidates: list[DataRecord], final: bool = False) -> tuple[DataRecordSet, int]:
         # get the set of input fields from both records in the join
@@ -690,36 +702,50 @@ class EmbeddingJoin(LLMJoin):
         output_records, output_record_op_stats, num_inputs_processed = [], [], 0
         # draw samples until num_samples is reached
-        if self.samples_drawn < self.num_samples:
-            samples_to_draw = min(self.num_samples - self.samples_drawn, len(join_candidates))
-            join_candidate_samples = join_candidates[:samples_to_draw]
-            join_candidates = join_candidates[samples_to_draw:]
-            # apply the generator to each pair of candidates
-            with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
-                futures = [
-                    executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim)
-                    for left_candidate, right_candidate, embedding_sim in join_candidate_samples
-                ]
-                # collect results as they complete
-                for future in as_completed(futures):
-                    self.join_idx += 1
-                    join_output_record, join_output_record_op_stats, embedding_sim = future.result()
-                    output_records.append(join_output_record)
-                    output_record_op_stats.append(join_output_record_op_stats)
-                    print(f"{self.join_idx} JOINED")
-                    # update similarity thresholds
-                    records_joined = join_output_record._passed_operator
-                    if not records_joined and embedding_sim > self.max_non_matching_sim:
-                        self.max_non_matching_sim = embedding_sim
-                    if records_joined and embedding_sim < self.min_matching_sim:
-                        self.min_matching_sim = embedding_sim
-            # update samples drawn and num_inputs_processed
-            self.samples_drawn += samples_to_draw
-            num_inputs_processed += samples_to_draw
+        with self.locks.exec_lock:
+            if self.samples_drawn < self.num_samples:
+                samples_to_draw = min(self.num_samples - self.samples_drawn, len(join_candidates))
+                join_candidate_samples = join_candidates[:samples_to_draw]
+                join_candidates = join_candidates[samples_to_draw:]
+                # apply the generator to each pair of candidates
+                with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
+                    futures = [
+                        executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim)
+                        for left_candidate, right_candidate, embedding_sim in join_candidate_samples
+                    ]
+                    # collect results as they complete
+                    similarities, joined = [], []
+                    for future in as_completed(futures):
+                        self.join_idx += 1
+                        join_output_record, join_output_record_op_stats, embedding_sim = future.result()
+                        output_records.append(join_output_record)
+                        output_record_op_stats.append(join_output_record_op_stats)
+                        similarities.append(embedding_sim)
+                        joined.append(join_output_record._passed_operator)
+                        print(f"{self.join_idx} JOINED")
+                    # sort join results by embedding similarity
+                    sorted_sim_join_tuples = sorted(zip(similarities, joined), key=lambda x: x[0])
+                    # compute threshold below which no records joined
+                    for embedding_sim, records_joined in sorted_sim_join_tuples:
+                        if records_joined:
+                            break
+                        if not records_joined and embedding_sim > self.max_non_matching_sim:
+                            self.max_non_matching_sim = embedding_sim
+                    # compute threshold above which all records joined
+                    for embedding_sim, records_joined in reversed(sorted_sim_join_tuples):
+                        if not records_joined:
+                            break
+                        if records_joined and embedding_sim < self.min_matching_sim:
+                            self.min_matching_sim = embedding_sim
+                # update samples drawn and num_inputs_processed
+                self.samples_drawn += samples_to_draw
+                num_inputs_processed += samples_to_draw
         # process remaining candidates based on embedding similarity
         if len(join_candidates) > 0:
@@ -727,43 +753,48 @@ class EmbeddingJoin(LLMJoin):
              with ThreadPoolExecutor(max_workers=self.join_parallelism) as executor:
                 futures = []
                 for left_candidate, right_candidate, embedding_sim in join_candidates:
-                    llm_call_needed = (
-                        self.min_matching_sim == float("inf")
-                        or self.max_non_matching_sim == float("-inf")
-                        or self.min_matching_sim <= embedding_sim <= self.max_non_matching_sim
-                    )
+                    # if the embedding similarity is lower than the threshold below which no records joined,
+                    # then we can skip the LLM call and mark the records as not joined
+                    if embedding_sim < self.max_non_matching_sim:
+                        futures.append(executor.submit(self._process_join_candidate_with_sim, left_candidate, right_candidate, embedding_sim, passed_operator=False))
-                    if llm_call_needed:
-                        futures.append(executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim))
+                    # if the embedding similarity is higher than the threshold above which all records joined,
+                    # then we can skip the LLM call and mark the records as joined
+                    elif embedding_sim > self.min_matching_sim:
+                        futures.append(executor.submit(self._process_join_candidate_with_sim, left_candidate, right_candidate, embedding_sim, passed_operator=True))
-                    elif embedding_sim < self.min_matching_sim:
-                        self.join_idx += 1
-                        output_record, record_op_stats = self._process_join_candidate_with_sim(left_candidate, right_candidate, passed_operator=False)
-                        output_records.append(output_record)
-                        output_record_op_stats.append(record_op_stats)
-                        print(f"{self.join_idx} SKIPPED (low sim: {embedding_sim:.4f} < {self.min_matching_sim:.4f})")
-                    elif embedding_sim > self.max_non_matching_sim:
-                        self.join_idx += 1
-                        output_record, record_op_stats = self._process_join_candidate_with_sim(left_candidate, right_candidate, passed_operator=True)
-                        output_records.append(output_record)
-                        output_record_op_stats.append(record_op_stats)
-                        print(f"{self.join_idx} JOINED (high sim: {embedding_sim:.4f} > {self.max_non_matching_sim:.4f})")
+                    # otherwise, we will process the LLM call
+                    else:
+                        futures.append(executor.submit(self._process_join_candidate_pair, left_candidate, right_candidate, gen_kwargs, embedding_sim))
                     num_inputs_processed += 1
                 # collect results as they complete
+                similarities, joined = [], []
                 for future in as_completed(futures):
                     self.join_idx += 1
                     join_output_record, join_output_record_op_stats, embedding_sim = future.result()
                     output_records.append(join_output_record)
                     output_record_op_stats.append(join_output_record_op_stats)
+                    similarities.append(embedding_sim)
+                    joined.append(join_output_record._passed_operator)
                     print(f"{self.join_idx} JOINED")
-                    # update similarity thresholds
-                    records_joined = join_output_record._passed_operator
+                ### update thresholds if there are llm calls which incrementally squeeze the boundaries ###
+                # sort join results by embedding similarity
+                sorted_sim_join_tuples = sorted(zip(similarities, joined), key=lambda x: x[0])
+                # potentially update threshold below which no records joined
+                for embedding_sim, records_joined in sorted_sim_join_tuples:
+                    if records_joined:
+                        break
                     if not records_joined and embedding_sim > self.max_non_matching_sim:
                         self.max_non_matching_sim = embedding_sim
+                # potentially update threshold above which all records joined
+                for embedding_sim, records_joined in reversed(sorted_sim_join_tuples):
+                    if not records_joined:
+                        break
                     if records_joined and embedding_sim < self.min_matching_sim:
                         self.min_matching_sim = embedding_sim

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/rag.py RENAMED Viewed

@@ -156,7 +156,7 @@ class RAGConvert(LLMConvert):
             # skip this field if it is not a string or a list of strings
             is_string_field = field.annotation in [str, str | None, str | Any]
             is_list_string_field = field.annotation in [list[str], list[str] | None, list[str] | Any]
-            if not (is_string_field or is_list_string_field):
+            if not (is_string_field or is_list_string_field) or candidate[field_name] is None:
                 continue
             # if this is a list of strings, join the strings

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/query/operators/topk.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import os
+import threading
 import time
 from typing import Callable
@@ -17,6 +18,24 @@ from palimpzest.core.models import GenerationStats, OperatorCostEstimates, Recor
 from palimpzest.query.operators.physical import PhysicalOperator
+class Singleton:
+     def __new__(cls, *args, **kw):
+         if not hasattr(cls, '_instance'):
+             orig = super(Singleton, cls)  # noqa: UP008
+             cls._instance = orig.__new__(cls, *args, **kw)
+         return cls._instance
+class ClipModel(Singleton):
+    model = None
+    lock = threading.Lock()
+    @classmethod
+    def get_model(cls, model_name: str):
+        with cls.lock:
+            if cls.model is None:
+                cls.model = SentenceTransformer(model_name)
+            return cls.model
 class TopKOp(PhysicalOperator):
     def __init__(
         self,
@@ -56,6 +75,7 @@ class TopKOp(PhysicalOperator):
         self.output_attrs = output_attrs
         self.search_func = search_func if search_func is not None else self.default_search_func
         self.k = k
+        self.clip_model = ClipModel()
     def __str__(self):
         op = super().__str__()
@@ -185,7 +205,6 @@ class TopKOp(PhysicalOperator):
         # construct and return the record set
         return DataRecordSet(drs, record_op_stats_lst)
     def __call__(self, candidate: DataRecord) -> DataRecordSet:
         start_time = time.time()
@@ -209,9 +228,9 @@ class TopKOp(PhysicalOperator):
         inputs, gen_stats = None, GenerationStats()
         if isinstance(self.index, Collection):
             uses_openai_embedding_fcn = isinstance(self.index._embedding_function, OpenAIEmbeddingFunction)
-            uses_sentence_transformer_embedding_fcn = isinstance(self.index._embedding_function, SentenceTransformerEmbeddingFunction)
+            uses_clip_model = isinstance(self.index._embedding_function, SentenceTransformerEmbeddingFunction)
             error_msg = "ChromaDB index must use OpenAI or SentenceTransformer embedding function; see: https://docs.trychroma.com/integrations/embedding-models/openai"
-            assert uses_openai_embedding_fcn or uses_sentence_transformer_embedding_fcn, error_msg
+            assert uses_openai_embedding_fcn or uses_clip_model, error_msg
             model_name = self.index._embedding_function.model_name if uses_openai_embedding_fcn else "clip-ViT-B-32"
             err_msg = f"For Chromadb, we currently only support `text-embedding-3-small` and `clip-ViT-B-32`; your index uses: {model_name}"
@@ -228,8 +247,8 @@ class TopKOp(PhysicalOperator):
                     total_input_tokens = response.usage.total_tokens
                     inputs = [item.embedding for item in response.data]
-                elif uses_sentence_transformer_embedding_fcn:
-                    model = SentenceTransformer(model_name)
+                elif uses_clip_model:
+                    model = self.clip_model.get_model(model_name)
                     inputs = model.encode(query)
                 embed_total_time = time.time() - embed_start_time

{palimpzest-1.1.0 → palimpzest-1.1.1}/src/palimpzest/query/processor/config.py RENAMED Viewed

@@ -48,6 +48,7 @@ class QueryProcessorConfig(BaseModel):
     seed: int = Field(default=42)
     exp_name: str | None = Field(default=None)
     priors: dict | None = Field(default=None)
+    dont_use_priors: bool = Field(default=False)
     def to_dict(self) -> dict:
         """Convert the config to a dict representation."""

{palimpzest-1.1.0 → palimpzest-1.1.1/src/palimpzest.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.1.0
+Version: 1.1.1
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org