palimpzest 1.1.1__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. {palimpzest-1.1.1/src/palimpzest.egg-info → palimpzest-1.3.0}/PKG-INFO +3 -2
  2. {palimpzest-1.1.1 → palimpzest-1.3.0}/pyproject.toml +3 -2
  3. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/constants.py +5 -5
  4. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/models.py +71 -1
  5. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/mab_execution_strategy.py +1 -1
  6. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/convert.py +2 -0
  7. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/filter.py +2 -0
  8. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/join.py +10 -6
  9. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/rag.py +14 -10
  10. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/cost_model.py +9 -4
  11. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/config.py +1 -1
  12. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/query_processor_factory.py +25 -0
  13. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/model_helpers.py +7 -8
  14. {palimpzest-1.1.1 → palimpzest-1.3.0/src/palimpzest.egg-info}/PKG-INFO +3 -2
  15. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/requires.txt +2 -1
  16. {palimpzest-1.1.1 → palimpzest-1.3.0}/LICENSE +0 -0
  17. {palimpzest-1.1.1 → palimpzest-1.3.0}/README.md +0 -0
  18. {palimpzest-1.1.1 → palimpzest-1.3.0}/setup.cfg +0 -0
  19. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/__init__.py +0 -0
  20. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/agents/__init__.py +0 -0
  21. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/agents/compute_agents.py +0 -0
  22. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/agents/search_agents.py +0 -0
  23. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/__init__.py +0 -0
  24. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/__init__.py +0 -0
  25. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/context.py +0 -0
  26. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/context_manager.py +0 -0
  27. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/dataset.py +0 -0
  28. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/index_dataset.py +0 -0
  29. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/iter_dataset.py +0 -0
  30. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/__init__.py +0 -0
  31. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/filters.py +0 -0
  32. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/groupbysig.py +0 -0
  33. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/records.py +0 -0
  34. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/lib/__init__.py +0 -0
  35. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/lib/schemas.py +0 -0
  36. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/policy.py +0 -0
  37. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/__init__.py +0 -0
  38. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/agent_prompts.py +0 -0
  39. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/aggregate_prompts.py +0 -0
  40. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/context_search.py +0 -0
  41. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/convert_prompts.py +0 -0
  42. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/critique_and_refine_prompts.py +0 -0
  43. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/filter_prompts.py +0 -0
  44. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/join_prompts.py +0 -0
  45. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/moa_aggregator_prompts.py +0 -0
  46. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/moa_proposer_prompts.py +0 -0
  47. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/prompt_factory.py +0 -0
  48. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
  49. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
  50. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/utils.py +0 -0
  51. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/validator.py +0 -0
  52. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/__init__.py +0 -0
  53. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/__init__.py +0 -0
  54. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/all_sample_execution_strategy.py +0 -0
  55. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/execution_strategy.py +0 -0
  56. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
  57. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/parallel_execution_strategy.py +0 -0
  58. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +0 -0
  59. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/generators/__init__.py +0 -0
  60. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/generators/generators.py +0 -0
  61. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/__init__.py +0 -0
  62. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/aggregate.py +0 -0
  63. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/compute.py +0 -0
  64. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/critique_and_refine.py +0 -0
  65. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/distinct.py +0 -0
  66. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/limit.py +0 -0
  67. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/logical.py +0 -0
  68. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/mixture_of_agents.py +0 -0
  69. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/physical.py +0 -0
  70. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/project.py +0 -0
  71. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/scan.py +0 -0
  72. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/search.py +0 -0
  73. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/split.py +0 -0
  74. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/topk.py +0 -0
  75. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/__init__.py +0 -0
  76. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/optimizer.py +0 -0
  77. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
  78. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
  79. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/plan.py +0 -0
  80. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/primitives.py +0 -0
  81. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/rules.py +0 -0
  82. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/tasks.py +0 -0
  83. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/__init__.py +0 -0
  84. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/query_processor.py +0 -0
  85. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/schemabuilder/__init__.py +0 -0
  86. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
  87. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/README.md +0 -0
  88. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/__init__.py +0 -0
  89. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/allenpdf.py +0 -0
  90. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/pdfparser.py +0 -0
  91. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/skema_tools.py +0 -0
  92. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/__init__.py +0 -0
  93. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/env_helpers.py +0 -0
  94. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/hash_helpers.py +0 -0
  95. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/progress.py +0 -0
  96. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/udfs.py +0 -0
  97. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/validator/__init__.py +0 -0
  98. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/validator/validator.py +0 -0
  99. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/SOURCES.txt +0 -0
  100. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/dependency_links.txt +0 -0
  101. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/top_level.txt +0 -0
{palimpzest-1.1.1/src/palimpzest.egg-info → palimpzest-1.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.1.1
+Version: 1.3.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -31,9 +31,10 @@ Requires-Dist: pillow>=11.3.0
 Requires-Dist: prettytable>=3.9.0
 Requires-Dist: psutil==5.9.5
 Requires-Dist: PyLD>=2.0.4
-Requires-Dist: pyarrow==20.0.0
+Requires-Dist: pyarrow>=20.0.0
 Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
+Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: requests>=2.25
 Requires-Dist: ruff>=0.9.0

{palimpzest-1.1.1 → palimpzest-1.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "1.1.1"
+version = "1.3.0"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -25,9 +25,10 @@ dependencies = [
     "prettytable>=3.9.0",
     "psutil==5.9.5",
     "PyLD>=2.0.4",
-    "pyarrow==20.0.0",
+    "pyarrow>=20.0.0",
     "pypdf>=5.1.0",
     "pytest-mock>=3.14.0",
+    "python-dotenv>=1.2.1",
     "pyyaml>=6.0.1",
     "requests>=2.25",
     "ruff>=0.9.0",

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/constants.py
@@ -31,9 +31,9 @@ class Model(str, Enum):
     GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
     GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
     GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
-    GOOGLE_GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
-    GOOGLE_GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
-    GOOGLE_GEMINI_2_5_PRO = "google/gemini-2.5-pro"
+    GOOGLE_GEMINI_2_5_FLASH = "gemini/gemini-2.5-flash"
+    GOOGLE_GEMINI_2_5_FLASH_LITE = "gemini/gemini-2.5-flash-lite"
+    GOOGLE_GEMINI_2_5_PRO = "gemini/gemini-2.5-pro"
     LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
     GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
     GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
@@ -72,8 +72,8 @@ class Model(str, Enum):
     def is_vertex_model(self):
         return "vertex_ai" in self.value.lower()

-    def is_google_model(self):
-        return "google" in self.value.lower()
+    def is_google_ai_studio_model(self):
+        return "gemini/" in self.value.lower()

     def is_vllm_model(self):
         return "hosted_vllm" in self.value.lower()

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/models.py
@@ -35,12 +35,18 @@ class GenerationStats(BaseModel):
     # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
     total_output_tokens: float = 0.0

+    # the total number of input tokens processed by embedding models
+    total_embedding_input_tokens: float = 0.0
+
     # the total cost of processing the input tokens; None if this operation did not use an LLM
     total_input_cost: float = 0.0

     # the total cost of processing the output tokens; None if this operation did not use an LLM
     total_output_cost: float = 0.0

+    # the total cost of processing input tokens for embedding models
+    total_embedding_cost: float = 0.0
+
     # the total cost of processing the input and output tokens; None if this operation did not use an LLM
     cost_per_record: float = 0.0

@@ -68,6 +74,9 @@ class GenerationStats(BaseModel):
             "fn_call_duration_secs",
             "total_llm_calls",
             "total_embedding_llm_calls",
+            "total_embedding_input_tokens",
+            "total_embedding_cost"
+
         ]:
             setattr(self, model_field, getattr(self, model_field) + getattr(other, model_field))
         return self
@@ -85,6 +94,8 @@ class GenerationStats(BaseModel):
                 "cost_per_record",
                 "total_llm_calls",
                 "total_embedding_llm_calls",
+                "total_embedding_input_tokens",
+                "total_embedding_cost"
             ]
         }
         # dct['raw_answers'] = self.raw_answers + other.raw_answers
@@ -107,6 +118,8 @@ class GenerationStats(BaseModel):
             "fn_call_duration_secs",
             "total_llm_calls",
             "total_embedding_llm_calls",
+            "total_embedding_input_tokens",
+            "total_embedding_cost"
         ]:
             setattr(self, model_field, getattr(self, model_field) / quotient)
         return self
@@ -128,6 +141,8 @@ class GenerationStats(BaseModel):
                 "total_llm_calls",
                 "total_embedding_llm_calls",
                 "cost_per_record",
+                "total_embedding_input_tokens",
+                "total_embedding_cost"
             ]
         }
         dct["model_name"] = self.model_name
@@ -217,6 +232,10 @@ class RecordOpStats(BaseModel):
     # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
     total_output_tokens: float = 0.0

+    # the total number of input tokens processed by embedding models
+    # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
+    total_embedding_input_tokens: float = 0.0
+
     # the total cost of processing the input tokens; None if this operation did not use an LLM
     total_input_cost: float = 0.0
@@ -278,6 +297,9 @@ class OperatorStats(BaseModel):
     # the total output tokens processed by this operation
     total_output_tokens: int = 0

+    # the total embedding input tokens processed by this operation
+    total_embedding_input_tokens: int = 0
+
     # a list of RecordOpStats processed by the operation
     record_op_stats_lst: list[RecordOpStats] = Field(default_factory=list)

@@ -309,6 +331,7 @@ class OperatorStats(BaseModel):
             self.total_op_cost += stats.total_op_cost
             self.total_input_tokens += stats.total_input_tokens
             self.total_output_tokens += stats.total_output_tokens
+            self.total_embedding_input_tokens += stats.total_embedding_input_tokens
             self.record_op_stats_lst.extend(stats.record_op_stats_lst)

         elif isinstance(stats, RecordOpStats):
@@ -319,6 +342,7 @@ class OperatorStats(BaseModel):
             self.total_op_cost += stats.cost_per_record
             self.total_input_tokens += stats.total_input_tokens
             self.total_output_tokens += stats.total_output_tokens
+            self.total_embedding_input_tokens += stats.total_embedding_input_tokens

         else:
             raise TypeError(f"Cannot add {type(stats)} to OperatorStats")
@@ -370,6 +394,9 @@ class BasePlanStats(BaseModel):
     # total output tokens processed by this plan
     total_output_tokens: int = 0

+    # total embedding input tokens processed by this plan
+    total_embedding_input_tokens: int = 0
+
     # start time for the plan execution; should be set by calling PlanStats.start()
     start_time: float | None = None

@@ -385,6 +412,7 @@ class BasePlanStats(BaseModel):
         self.total_plan_cost = self.sum_op_costs() + self.sum_validation_costs()
         self.total_input_tokens = self.sum_input_tokens() + self.sum_validation_input_tokens()
         self.total_output_tokens = self.sum_output_tokens() + self.sum_validation_output_tokens()
+        self.total_embedding_input_tokens = self.sum_embedding_input_tokens() + self.sum_validation_embedding_input_tokens()

     @staticmethod
     @abstractmethod
@@ -415,6 +443,13 @@ class BasePlanStats(BaseModel):
         """
         pass

+    @abstractmethod
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the input embedding tokens processed by all operators in this plan.
+        """
+        pass
+
     @abstractmethod
     def add_record_op_stats(self, unique_full_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
@@ -453,6 +488,12 @@ class BasePlanStats(BaseModel):
         Sum the output tokens processed by all validation generations in this plan.
         """
         return sum([gen_stats.total_output_tokens for _, gen_stats in self.validation_gen_stats.items()])
+
+    def sum_validation_embedding_input_tokens(self) -> int:
+        """
+        Sum the input embedding tokens processed by all validation generations in this plan.
+        """
+        return sum([gen_stats.total_embedding_input_tokens for _, gen_stats in self.validation_gen_stats.items()])

     def get_total_cost_so_far(self) -> float:
         """
@@ -501,6 +542,12 @@ class PlanStats(BasePlanStats):
         Sum the output tokens processed by all operators in this plan.
         """
         return sum([op_stats.total_output_tokens for _, op_stats in self.operator_stats.items()])
+
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the input embedding tokens processed by all operators in this plan.
+        """
+        return sum([op_stats.total_embedding_input_tokens for _, op_stats in self.operator_stats.items()])

     def add_record_op_stats(self, unique_full_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
@@ -528,6 +575,7 @@ class PlanStats(BasePlanStats):
         self.total_plan_cost += plan_stats.total_plan_cost
         self.total_input_tokens += plan_stats.total_input_tokens
         self.total_output_tokens += plan_stats.total_output_tokens
+        self.total_embedding_input_tokens += plan_stats.total_embedding_input_tokens
         for unique_full_op_id, op_stats in plan_stats.operator_stats.items():
             if unique_full_op_id in self.operator_stats:
                 self.operator_stats[unique_full_op_id] += op_stats
@@ -539,6 +587,7 @@ class PlanStats(BasePlanStats):
         stats += f"total_plan_cost={self.total_plan_cost} \n"
         stats += f"total_input_tokens={self.total_input_tokens} \n"
         stats += f"total_output_tokens={self.total_output_tokens} \n"
+        stats += f"total_embedding_input_tokens={self.total_embedding_input_tokens} \n"
         for idx, op_stats in enumerate(self.operator_stats.values()):
             stats += f"{idx}. {op_stats.op_name} time={op_stats.total_op_time} cost={op_stats.total_op_cost} \n"
         return stats
@@ -586,6 +635,12 @@ class SentinelPlanStats(BasePlanStats):
         Sum the output tokens processed by all operators in this plan.
         """
         return sum(sum([op_stats.total_output_tokens for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())
+
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the embedding input tokens processed by all operators in this plan.
+        """
+        return sum(sum([op_stats.total_embedding_input_tokens for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())

     def add_record_op_stats(self, unique_logical_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
@@ -627,6 +682,7 @@ class SentinelPlanStats(BasePlanStats):
         self.total_plan_cost += plan_stats.total_plan_cost
         self.total_input_tokens += plan_stats.total_input_tokens
         self.total_output_tokens += plan_stats.total_output_tokens
+        self.total_embedding_input_tokens += plan_stats.total_embedding_input_tokens
         for unique_logical_op_id, physical_op_stats in plan_stats.operator_stats.items():
             for full_op_id, op_stats in physical_op_stats.items():
                 if unique_logical_op_id in self.operator_stats:
@@ -648,6 +704,7 @@ class SentinelPlanStats(BasePlanStats):
         stats += f"total_plan_cost={self.total_plan_cost} \n"
         stats += f"total_input_tokens={self.total_input_tokens} \n"
         stats += f"total_output_tokens={self.total_output_tokens} \n"
+        stats += f"total_embedding_input_tokens={self.total_embedding_input_tokens} \n"
         for outer_idx, physical_op_stats in enumerate(self.operator_stats.values()):
             total_time = sum([op_stats.total_op_time for op_stats in physical_op_stats.values()])
             total_cost = sum([op_stats.total_op_cost for op_stats in physical_op_stats.values()])
@@ -695,6 +752,9 @@ class ExecutionStats(BaseModel):
     # total number of output tokens processed
     total_output_tokens: int = 0

+    # total number of embedding input tokens processed
+    total_embedding_input_tokens: int = 0
+
     # total number of tokens processed
     total_tokens: int = 0
@@ -748,7 +808,8 @@ class ExecutionStats(BaseModel):
         # compute the tokens for total execution
         self.total_input_tokens = self.sum_input_tokens()
         self.total_output_tokens = self.sum_output_tokens()
-        self.total_tokens = self.total_input_tokens + self.total_output_tokens
+        self.total_embedding_input_tokens = self.sum_embedding_input_tokens()
+        self.total_tokens = self.total_input_tokens + self.total_output_tokens + self.total_embedding_input_tokens

         # compute plan_strs
         self.plan_strs = {plan_id: plan_stats.plan_str for plan_id, plan_stats in self.plan_stats.items()}
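
Note the behavioral change in this hunk: total_tokens now also counts embedding input tokens. With hypothetical numbers:

    total_input_tokens = 10_000
    total_output_tokens = 2_000
    total_embedding_input_tokens = 3_000

    total_tokens = total_input_tokens + total_output_tokens + total_embedding_input_tokens
    assert total_tokens == 15_000   # 1.1.1 would have reported 12_000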
@@ -780,6 +841,15 @@ class ExecutionStats(BaseModel):
         sentinel_plan_output_tokens = sum([plan_stats.sum_output_tokens() for _, plan_stats in self.sentinel_plan_stats.items()])
         plan_output_tokens = sum([plan_stats.sum_output_tokens() for _, plan_stats in self.plan_stats.items()])
         return plan_output_tokens + sentinel_plan_output_tokens
+
+
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the embedding input tokens processed in this execution
+        """
+        sentinel_plan_embedding_input_tokens = sum([plan_stats.sum_embedding_input_tokens() for _, plan_stats in self.sentinel_plan_stats.items()])
+        plan_embedding_input_tokens = sum([plan_stats.sum_embedding_input_tokens() for _, plan_stats in self.plan_stats.items()])
+        return plan_embedding_input_tokens + sentinel_plan_embedding_input_tokens

     def add_plan_stats(self, plan_stats: PlanStats | SentinelPlanStats | list[PlanStats] | list[SentinelPlanStats]) -> None:
         """

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/mab_execution_strategy.py
@@ -777,7 +777,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):

         # if the operator is a non-llm filter which has filtered out records, remove those records from
         # all downstream operators' full_op_id_to_sources_not_processed
-        if isinstance(op_set[0], NonLLMFilter):
+        if isinstance(op_set[0], NonLLMFilter) and next_unique_logical_op_id is not None:
             self._remove_filtered_records_from_downstream_ops(topo_idx, plan, op_frontiers, source_indices_to_all_record_sets)

         # finalize plan stats

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/convert.py
@@ -121,8 +121,10 @@ class ConvertOp(PhysicalOperator, ABC):
             generated_fields=field_names,
             total_input_tokens=per_record_stats.total_input_tokens,
             total_output_tokens=per_record_stats.total_output_tokens,
+            total_embedding_input_tokens=per_record_stats.total_embedding_input_tokens,
             total_input_cost=per_record_stats.total_input_cost,
             total_output_cost=per_record_stats.total_output_cost,
+            total_embedding_cost=per_record_stats.total_embedding_cost,
             llm_call_duration_secs=per_record_stats.llm_call_duration_secs,
             fn_call_duration_secs=per_record_stats.fn_call_duration_secs,
             total_llm_calls=per_record_stats.total_llm_calls,

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/filter.py
@@ -89,8 +89,10 @@ class FilterOp(PhysicalOperator, ABC):
             filter_str=self.filter_obj.get_filter_str(),
             total_input_tokens=generation_stats.total_input_tokens,
             total_output_tokens=generation_stats.total_output_tokens,
+            total_embedding_input_tokens=generation_stats.total_embedding_input_tokens,
             total_input_cost=generation_stats.total_input_cost,
             total_output_cost=generation_stats.total_output_cost,
+            total_embedding_cost=generation_stats.total_embedding_cost,
             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
             total_llm_calls=generation_stats.total_llm_calls,

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/join.py
@@ -376,8 +376,10 @@ class LLMJoin(JoinOp):
             join_condition=self.condition,
             total_input_tokens=generation_stats.total_input_tokens,
             total_output_tokens=generation_stats.total_output_tokens,
+            total_embedding_input_tokens=generation_stats.total_embedding_input_tokens,
             total_input_cost=generation_stats.total_input_cost,
             total_output_cost=generation_stats.total_output_cost,
+            total_embedding_cost=generation_stats.total_embedding_cost,
             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
             total_llm_calls=generation_stats.total_llm_calls,
@@ -584,13 +586,13 @@ class EmbeddingJoin(LLMJoin):
             return np.zeros((0, 512)), GenerationStats()

         start_time = time.time()
-        total_input_tokens = 0
+        total_embedding_input_tokens = 0
         embeddings = None
         if self.text_only:
             client = OpenAI()
             inputs = [dr.to_json_str(bytes_to_str=True, project_cols=input_fields, sorted=True) for dr in candidates]
             response = client.embeddings.create(input=inputs, model=self.embedding_model.value)
-            total_input_tokens = response.usage.total_tokens
+            total_embedding_input_tokens = response.usage.total_tokens
             embeddings = np.array([item.embedding for item in response.data])
         else:
             model = self.locks.get_model(self.embedding_model.value)
@@ -616,14 +618,16 @@ class EmbeddingJoin(LLMJoin):

         # compute cost of embedding(s)
         model_card = MODEL_CARDS[self.embedding_model.value]
-        total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+        total_embedding_cost = model_card["usd_per_input_token"] * total_embedding_input_tokens
         embedding_gen_stats = GenerationStats(
             model_name=self.embedding_model.value,
-            total_input_tokens=total_input_tokens,
+            total_input_tokens=0.0,
             total_output_tokens=0.0,
-            total_input_cost=total_input_cost,
+            total_embedding_input_tokens=total_embedding_input_tokens,
+            total_input_cost=0.0,
             total_output_cost=0.0,
-            cost_per_record=total_input_cost,
+            total_embedding_cost=total_embedding_cost,
+            cost_per_record=total_embedding_cost,
             llm_call_duration_secs=time.time() - start_time,
             total_llm_calls=1,
             total_embedding_llm_calls=len(candidates),
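
The effect of the EmbeddingJoin change is that embedding usage is re-bucketed rather than re-priced: the chat-completion fields are zeroed and the same dollar amount flows through the new embedding fields, leaving cost_per_record unchanged in value. A sketch with a hypothetical model-card price:

    usd_per_input_token = 0.02 / 1_000_000          # hypothetical embedding price
    total_embedding_input_tokens = 4_096            # e.g. response.usage.total_tokens

    total_embedding_cost = usd_per_input_token * total_embedding_input_tokens
    stats_kwargs = dict(
        total_input_tokens=0.0,                     # no longer reported as LLM input
        total_embedding_input_tokens=total_embedding_input_tokens,
        total_input_cost=0.0,
        total_embedding_cost=total_embedding_cost,
        cost_per_record=total_embedding_cost,       # same value as 1.1.1, new bucket
    )
    print(f"embedding cost: ${stats_kwargs['total_embedding_cost']:.8f}")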

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/rag.py
@@ -109,15 +109,17 @@ class RAGConvert(LLMConvert):

         # compute the generation stats object
         model_card = MODEL_CARDS[model_name]
-        total_input_tokens = response.usage.total_tokens
-        total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+        total_embedding_input_tokens = response.usage.total_tokens
+        total_embedding_cost = model_card["usd_per_input_token"] * total_embedding_input_tokens
         embed_stats = GenerationStats(
             model_name=model_name,  # NOTE: this should be overwritten by generation model in convert()
-            total_input_tokens=total_input_tokens,
+            total_input_tokens=0.0,
             total_output_tokens=0.0,
-            total_input_cost=total_input_cost,
+            total_embedding_input_tokens=total_embedding_input_tokens,
+            total_input_cost=0.0,
             total_output_cost=0.0,
-            cost_per_record=total_input_cost,
+            total_embedding_cost=total_embedding_cost,
+            cost_per_record=total_embedding_cost,
             llm_call_duration_secs=total_time,
             total_llm_calls=1,
             total_embedding_llm_calls=1,
@@ -318,15 +320,17 @@ class RAGFilter(LLMFilter):

         # compute the generation stats object
         model_card = MODEL_CARDS[model_name]
-        total_input_tokens = response.usage.total_tokens
-        total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+        total_embedding_input_tokens = response.usage.total_tokens
+        total_embedding_cost = model_card["usd_per_input_token"] * total_embedding_input_tokens
         embed_stats = GenerationStats(
             model_name=model_name,  # NOTE: this should be overwritten by generation model in filter()
-            total_input_tokens=total_input_tokens,
+            total_input_tokens=0.0,
             total_output_tokens=0.0,
-            total_input_cost=total_input_cost,
+            total_embedding_input_tokens=total_embedding_input_tokens,
+            total_input_cost=0.0,
             total_output_cost=0.0,
-            cost_per_record=total_input_cost,
+            total_embedding_cost=total_embedding_cost,
+            cost_per_record=total_embedding_cost,
             llm_call_duration_secs=total_time,
             total_llm_calls=1,
             total_embedding_llm_calls=1,

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/cost_model.py
@@ -105,9 +105,10 @@ class SampleBasedCostModel:
                 "time_per_record": record_op_stats.time_per_record,
                 "quality": record_op_stats.quality,
                 "passed_operator": record_op_stats.passed_operator,
-                "source_indices": record_op_stats.record_source_indices,  # TODO: remove
-                "op_details": record_op_stats.op_details,  # TODO: remove
-                "answer": record_op_stats.answer,  # TODO: remove
+                "source_indices": record_op_stats.record_source_indices,
+                "op_details": record_op_stats.op_details,
+                "answer": record_op_stats.answer,
+                "op_name": record_op_stats.op_name,
             }
             execution_record_op_stats.append(record_op_stats_dict)

@@ -128,8 +129,12 @@ class SampleBasedCostModel:
             else physical_op_df.source_indices.apply(tuple).nunique()
         )

-        # compute selectivity
+        # compute selectivity; for filters this may be 1.0 on small samples;
+        # always put something slightly less than 1.0 to ensure that filters are pushed down when possible
         selectivity = physical_op_df.passed_operator.sum() / num_source_records
+        op_name = physical_op_df.op_name.iloc[0].lower()
+        if selectivity == 1.0 and "filter" in op_name:
+            selectivity -= 1e-3

         # compute quality; if all qualities are None then this will be NaN
         quality = physical_op_df.quality.mean()
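
A worked example of the selectivity nudge: on a small sample where every record passes a filter, the raw estimate of 1.0 would make the filter look free to defer, so it is pulled just below 1.0:

    passed_operator_sum = 20
    num_source_records = 20
    op_name = "llm_filter"

    selectivity = passed_operator_sum / num_source_records   # 1.0 on this sample
    if selectivity == 1.0 and "filter" in op_name:
        selectivity -= 1e-3                                  # keep filter pushdown attractive
    assert selectivity == 0.999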

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/config.py
@@ -27,7 +27,7 @@ class QueryProcessorConfig(BaseModel):
     join_parallelism: int = Field(default=64)
     batch_size: int | None = Field(default=None)
     reasoning_effort: str | None = Field(default=None)  # Gemini: "disable", "low", "medium", "high"
-    use_vertex: bool = Field(default=True)  # Whether to use Vertex models for Gemini or Google models
+    use_vertex: bool = Field(default=False)  # Whether to use Vertex models for Gemini or Google models
     gemini_credentials_path: str | None = Field(default=None)  # Path to Gemini credentials file
     api_base: str | None = Field(default=None)  # API base URL for vLLM

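Flipping the default means fresh configs route Gemini traffic through Google AI Studio ("gemini/..." ids) unless Vertex is requested explicitly. A hypothetical usage sketch (import path inferred from the file layout above):

    from palimpzest.query.processor.config import QueryProcessorConfig

    config = QueryProcessorConfig()          # 1.3.0: use_vertex now defaults to False
    vertex_config = QueryProcessorConfig(    # opting back into Vertex AI
        use_vertex=True,
        gemini_credentials_path="/path/to/creds.json",
    )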

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/query_processor_factory.py
@@ -1,6 +1,9 @@
 import logging
+import os
 from enum import Enum

+from dotenv import load_dotenv
+
 from palimpzest.core.data.dataset import Dataset
 from palimpzest.core.elements.records import DataRecordCollection
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
@@ -91,6 +94,27 @@ class QueryProcessorFactory:
         # set the final set of available models in the config
         config.available_models = available_models

+        if len(config.available_models) == 0:
+            raise ValueError("No available models found.")
+
+        openai_key = os.getenv("OPENAI_API_KEY")
+        anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+        together_key = os.getenv("TOGETHER_API_KEY")
+        gemini_key = os.getenv("GEMINI_API_KEY")
+        google_key = os.getenv("GOOGLE_API_KEY")
+
+        for model in config.available_models:
+            if model.is_openai_model() and not openai_key:
+                raise ValueError("OPENAI_API_KEY must be set to use OpenAI models.")
+            if model.is_anthropic_model() and not anthropic_key:
+                raise ValueError("ANTHROPIC_API_KEY must be set to use Anthropic models.")
+            if model.is_together_model() and not together_key:
+                raise ValueError("TOGETHER_API_KEY must be set to use Together models.")
+            if model.is_google_ai_studio_model() and not (gemini_key or google_key or config.gemini_credentials_path):
+                raise ValueError("GEMINI_API_KEY, GOOGLE_API_KEY, or gemini_credentials path must be set to use Google Gemini models.")
+            if model.is_vllm_model() and config.api_base is None:
+                raise ValueError("api_base must be set to use vLLM models.")
+
         return config, validator

     @classmethod
@@ -172,6 +196,7 @@ class QueryProcessorFactory:
         train_dataset: dict[str, Dataset] | None = None,
         validator: Validator | None = None,
     ) -> DataRecordCollection:
+        load_dotenv(override=True)
        logger.info(f"Creating processor for dataset: {dataset}")
        processor = cls.create_processor(dataset, config, train_dataset, validator)
        logger.info(f"Created processor: {processor}")

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/model_helpers.py
@@ -3,13 +3,12 @@ import os
 from palimpzest.constants import Model


-# TODO: better handle vertex vs. google for gemini models
-def get_models(include_embedding: bool = False, use_vertex: bool = True, gemini_credentials_path: str | None = None, api_base: str | None = None) -> list[Model]:
+def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_credentials_path: str | None = None, api_base: str | None = None) -> list[Model]:
     """
     Return the set of models which the system has access to based on the set environment variables.
     """
     models = []
-    if os.getenv("OPENAI_API_KEY") is not None:
+    if os.getenv("OPENAI_API_KEY") not in [None, ""]:
         openai_models = [model for model in Model if model.is_openai_model()]
         if not include_embedding:
             openai_models = [
@@ -17,7 +16,7 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
             ]
         models.extend(openai_models)

-    if os.getenv("TOGETHER_API_KEY") is not None:
+    if os.getenv("TOGETHER_API_KEY") not in [None, ""]:
         together_models = [model for model in Model if model.is_together_model()]
         if not include_embedding:
             together_models = [
@@ -25,7 +24,7 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
             ]
         models.extend(together_models)

-    if os.getenv("ANTHROPIC_API_KEY") is not None:
+    if os.getenv("ANTHROPIC_API_KEY") not in [None, ""]:
         anthropic_models = [model for model in Model if model.is_anthropic_model()]
         if not include_embedding:
             anthropic_models = [
@@ -38,9 +37,9 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
         if gemini_credentials_path is None
         else gemini_credentials_path
     )
-    if os.getenv("GEMINI_API_KEY") is not None or os.path.exists(gemini_credentials_path):
+    if os.getenv("GEMINI_API_KEY") not in [None, ""] or (use_vertex and os.path.exists(gemini_credentials_path)):
         vertex_models = [model for model in Model if model.is_vertex_model()]
-        google_models = [model for model in Model if model.is_google_model()]
+        google_ai_studio_models = [model for model in Model if model.is_google_ai_studio_model()]
         if not include_embedding:
             vertex_models = [
                 model for model in vertex_models if not model.is_embedding_model()
@@ -48,7 +47,7 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
         if use_vertex:
             models.extend(vertex_models)
         else:
-            models.extend(google_models)
+            models.extend(google_ai_studio_models)

     if api_base is not None:
         vllm_models = [model for model in Model if model.is_vllm_model()]
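
The key checks also tightened from `is not None` to `not in [None, ""]`, so an exported-but-empty variable no longer makes a provider look configured. A small illustration:

    import os

    def has_key(name: str) -> bool:
        # 1.3.0-style check: an empty string does not count as a configured key
        return os.getenv(name) not in [None, ""]

    os.environ["OPENAI_API_KEY"] = ""
    assert not has_key("OPENAI_API_KEY")   # 1.1.1's `is not None` check passed here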

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.1.1
+Version: 1.3.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -31,9 +31,10 @@ Requires-Dist: pillow>=11.3.0
 Requires-Dist: prettytable>=3.9.0
 Requires-Dist: psutil==5.9.5
 Requires-Dist: PyLD>=2.0.4
-Requires-Dist: pyarrow==20.0.0
+Requires-Dist: pyarrow>=20.0.0
 Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
+Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: requests>=2.25
 Requires-Dist: ruff>=0.9.0

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/requires.txt
@@ -14,9 +14,10 @@ pillow>=11.3.0
 prettytable>=3.9.0
 psutil==5.9.5
 PyLD>=2.0.4
-pyarrow==20.0.0
+pyarrow>=20.0.0
 pypdf>=5.1.0
 pytest-mock>=3.14.0
+python-dotenv>=1.2.1
 pyyaml>=6.0.1
 requests>=2.25
 ruff>=0.9.0