PyPI - palimpzest - Versions diffs - 0.8.0__tar.gz → 0.8.2__tar.gz - Mend

palimpzest 0.8.0__tar.gz → 0.8.2__tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (100) hide show

{palimpzest-0.8.0/src/palimpzest.egg-info → palimpzest-0.8.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.8.0
+Version: 0.8.2
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org

{palimpzest-0.8.0 → palimpzest-0.8.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "0.8.0"
+version = "0.8.2"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.8"

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/constants.py RENAMED Viewed

@@ -18,8 +18,12 @@ class Model(str, Enum):
     DEEPSEEK_R1_DISTILL_QWEN_1_5B = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
     GPT_4o = "openai/gpt-4o-2024-08-06"
     GPT_4o_MINI = "openai/gpt-4o-mini-2024-07-18"
-    GPT_5 = "openai/gpt-5"
-    GPT_5_MINI = "openai/gpt-5-mini"
+    GPT_4_1 = "openai/gpt-4.1-2025-04-14"
+    GPT_4_1_MINI = "openai/gpt-4.1-mini-2025-04-14"
+    GPT_4_1_NANO = "openai/gpt-4.1-nano-2025-04-14"
+    GPT_5 = "openai/gpt-5-2025-08-07"
+    GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
+    GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
     o4_MINI = "openai/o4-mini-2025-04-16"  # noqa: N815
     TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
     CLIP_VIT_B_32 = "clip-ViT-B-32"
@@ -29,6 +33,9 @@ class Model(str, Enum):
     GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
     GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
     GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
+    GOOGLE_GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
+    GOOGLE_GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
+    GOOGLE_GEMINI_2_5_PRO = "google/gemini-2.5-pro"
     LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
     GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
     GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
@@ -54,7 +61,7 @@ class Model(str, Enum):
         return self in [Model.o4_MINI]
     def is_gpt_5_model(self):
-        return self in [Model.GPT_5, Model.GPT_5_MINI]
+        return self in [Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO]
     def is_openai_model(self):
         return "openai" in self.value.lower() or self.is_text_embedding_model()
@@ -65,13 +72,17 @@ class Model(str, Enum):
     def is_vertex_model(self):
         return "vertex_ai" in self.value.lower()
+    def is_google_model(self):
+        return "google" in self.value.lower()
     def is_vllm_model(self):
         return "hosted_vllm" in self.value.lower()
     def is_reasoning_model(self):
         reasoning_models = [
-            Model.GPT_5, Model.GPT_5_MINI, Model.o4_MINI,
+            Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO, Model.o4_MINI,
             Model.GEMINI_2_5_PRO, Model.GEMINI_2_5_FLASH,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
             Model.CLAUDE_3_7_SONNET,
         ]
         return self in reasoning_models
@@ -88,27 +99,31 @@ class Model(str, Enum):
     def is_vision_model(self):
         return self in [
             Model.LLAMA3_2_90B_V, Model.LLAMA_4_MAVERICK,
-            Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
             Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
         ]
     def is_audio_model(self):
         return self in [
             Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
             Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
         ]
     def is_text_image_multimodal_model(self):
         return self in [
             Model.LLAMA_4_MAVERICK,
-            Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
+            Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
             Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
         ]
     def is_text_audio_multimodal_model(self):
         return self in [
             Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
             Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
+            Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
         ]
     def is_embedding_model(self):
@@ -327,7 +342,7 @@ LLAMA3_2_3B_INSTRUCT_MODEL_CARD = {
     "usd_per_input_token": 0.06 / 1e6,
     "usd_per_output_token": 0.06 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0064,
+    "seconds_per_output_token": 0.0079,
     ##### Agg. Benchmark #####
     "overall": 36.50, # https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/discussions/13
 }
@@ -336,7 +351,7 @@ LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
     "usd_per_input_token": 0.18 / 1e6,
     "usd_per_output_token": 0.18 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0059,
+    "seconds_per_output_token": 0.0050,
     ##### Agg. Benchmark #####
     "overall": 44.25,
 }
@@ -345,7 +360,7 @@ LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
     "usd_per_input_token": 0.88 / 1e6,
     "usd_per_output_token": 0.88 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0139,
+    "seconds_per_output_token": 0.0122,
     ##### Agg. Benchmark #####
     "overall": 69.9,
 }
@@ -354,7 +369,7 @@ LLAMA3_2_90B_V_MODEL_CARD = {
     "usd_per_input_token": 1.2 / 1e6,
     "usd_per_output_token": 1.2 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0222,
+    "seconds_per_output_token": 0.0303,
     ##### Agg. Benchmark #####
     "overall": 65.00, # set to be slightly higher than gpt-4o-mini
 }
@@ -363,7 +378,7 @@ DEEPSEEK_V3_MODEL_CARD = {
     "usd_per_input_token": 1.25 / 1E6,
     "usd_per_output_token": 1.25 / 1E6,
     ##### Time #####
-    "seconds_per_output_token": 0.0769,
+    "seconds_per_output_token": 0.0114,
     ##### Agg. Benchmark #####
     "overall": 73.8,
 }
@@ -372,7 +387,7 @@ DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
     "usd_per_input_token": 0.18 / 1E6,
     "usd_per_output_token": 0.18 / 1E6,
     ##### Time #####
-    "seconds_per_output_token": 0.0026,
+    "seconds_per_output_token": 0.0050, # NOTE: copied to be same as LLAMA3_1_8B_INSTRUCT_MODEL_CARD; need to update when we have data
     ##### Agg. Benchmark #####
     "overall": 39.90, # https://www.reddit.com/r/LocalLLaMA/comments/1iserf9/deepseek_r1_distilled_models_mmlu_pro_benchmarks/
 }
@@ -382,7 +397,7 @@ GPT_4o_AUDIO_PREVIEW_MODEL_CARD = {
     "usd_per_audio_input_token": 2.5 / 1e6,
     "usd_per_output_token": 10.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0079,
+    "seconds_per_output_token": 0.0080,
     ##### Agg. Benchmark #####
     "overall": 74.1,
 }
@@ -392,7 +407,7 @@ GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD = {
     "usd_per_audio_input_token": 0.15 / 1e6,
     "usd_per_output_token": 0.6 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0098,
+    "seconds_per_output_token": 0.0159,
     ##### Agg. Benchmark #####
     "overall": 62.7,
 }
@@ -402,7 +417,7 @@ GPT_4o_MODEL_CARD = {
     "usd_per_input_token": 2.5 / 1e6,
     "usd_per_output_token": 10.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0079,
+    "seconds_per_output_token": 0.0080,
     ##### Agg. Benchmark #####
     "overall": 74.1,
 }
@@ -412,17 +427,47 @@ GPT_4o_MINI_MODEL_CARD = {
     "usd_per_input_token": 0.15 / 1e6,
     "usd_per_output_token": 0.6 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0098,
+    "seconds_per_output_token": 0.0159,
     ##### Agg. Benchmark #####
     "overall": 62.7,
 }
+GPT_4_1_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 2.0 / 1e6,
+    "usd_per_output_token": 8.0 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0076,
+    ##### Agg. Benchmark #####
+    "overall": 80.5,
+}
+GPT_4_1_MINI_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.4 / 1e6,
+    "usd_per_output_token": 1.6 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0161,
+    ##### Agg. Benchmark #####
+    "overall": 77.2,
+}
+GPT_4_1_NANO_MODEL_CARD = {
+    # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.1 / 1e6,
+    "usd_per_output_token": 0.4 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0060,
+    ##### Agg. Benchmark #####
+    "overall": 62.3,
+}
 GPT_5_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
     ##### Cost in USD #####
     "usd_per_input_token": 1.25 / 1e6,
     "usd_per_output_token": 10.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0139,
+    "seconds_per_output_token": 0.0060,
     ##### Agg. Benchmark #####
     "overall": 87.00,
 }
@@ -432,30 +477,40 @@ GPT_5_MINI_MODEL_CARD = {
     "usd_per_input_token": 0.25 / 1e6,
     "usd_per_output_token": 2.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0094,
+    "seconds_per_output_token": 0.0135,
     ##### Agg. Benchmark #####
     "overall": 82.50,
 }
-o4_MINI_MODEL_CARD = {  # noqa: N816
+GPT_5_NANO_MODEL_CARD = {
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
     ##### Cost in USD #####
-    "usd_per_input_token": 1.1 / 1e6,
-    "usd_per_output_token": 4.4 / 1e6,
+    "usd_per_input_token": 0.05 / 1e6,
+    "usd_per_output_token": 0.4 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0093,
+    "seconds_per_output_token": 0.0055,
     ##### Agg. Benchmark #####
-    "overall": 80.6,  # using number reported for o3-mini; true number is likely higher
+    "overall": 77.9,
 }
-o1_MODEL_CARD = {  # noqa: N816
+o4_MINI_MODEL_CARD = {  # noqa: N816
     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
     ##### Cost in USD #####
-    "usd_per_input_token": 15 / 1e6,
-    "usd_per_output_token": 60 / 1e6,
+    "usd_per_input_token": 1.1 / 1e6,
+    "usd_per_output_token": 4.4 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0110,
+    "seconds_per_output_token": 0.0092,
     ##### Agg. Benchmark #####
-    "overall": 83.50,
+    "overall": 80.6,  # using number reported for o3-mini; true number is likely higher
 }
+# o1_MODEL_CARD = {  # noqa: N816
+#     # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
+#     ##### Cost in USD #####
+#     "usd_per_input_token": 15 / 1e6,
+#     "usd_per_output_token": 60 / 1e6,
+#     ##### Time #####
+#     "seconds_per_output_token": 0.0110,
+#     ##### Agg. Benchmark #####
+#     "overall": 83.50,
+# }
 TEXT_EMBEDDING_3_SMALL_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 0.02 / 1e6,
@@ -479,7 +534,7 @@ CLAUDE_3_5_SONNET_MODEL_CARD = {
     "usd_per_input_token": 3.0 / 1e6,
     "usd_per_output_token": 15.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0127,
+    "seconds_per_output_token": 0.0154,
     ##### Agg. Benchmark #####
     "overall": 78.4,
 }
@@ -488,7 +543,7 @@ CLAUDE_3_7_SONNET_MODEL_CARD = {
     "usd_per_input_token": 3.0 / 1e6,
     "usd_per_output_token": 15.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0130,
+    "seconds_per_output_token": 0.0156,
     ##### Agg. Benchmark #####
     "overall": 80.7,
 }
@@ -497,7 +552,7 @@ CLAUDE_3_5_HAIKU_MODEL_CARD = {
     "usd_per_input_token": 0.8 / 1e6,
     "usd_per_output_token": 4.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0152,
+    "seconds_per_output_token": 0.0189,
     ##### Agg. Benchmark #####
     "overall": 64.1,
 }
@@ -507,17 +562,27 @@ GEMINI_2_0_FLASH_MODEL_CARD = {
     "usd_per_output_token": 0.6 / 1e6,
     "usd_per_audio_input_token": 1.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0049,
+    "seconds_per_output_token": 0.0054,
     ##### Agg. Benchmark #####
     "overall": 77.40,
 }
+GEMINI_2_5_FLASH_LITE_MODEL_CARD = {
+    ##### Cost in USD #####
+    "usd_per_input_token": 0.1 / 1e6,
+    "usd_per_output_token": 0.4 / 1e6,
+    "usd_per_audio_input_token": 0.3 / 1e6,
+    ##### Time #####
+    "seconds_per_output_token": 0.0034,
+    ##### Agg. Benchmark #####
+    "overall": 79.1, # NOTE: interpolated between gemini 2.5 flash and gemini 2.0 flash
+}
 GEMINI_2_5_FLASH_MODEL_CARD = {
     ##### Cost in USD #####
     "usd_per_input_token": 0.30 / 1e6,
     "usd_per_output_token": 2.5 / 1e6,
     "usd_per_audio_input_token": 1.0 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0039,
+    "seconds_per_output_token": 0.0044,
     ##### Agg. Benchmark #####
     "overall": 80.75, # NOTE: interpolated between gemini 2.0 flash and gemini 2.5 pro
 }
@@ -527,7 +592,7 @@ GEMINI_2_5_PRO_MODEL_CARD = {
     "usd_per_output_token": 10.0 / 1e6,
     "usd_per_audio_input_token": 1.25 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0070,
+    "seconds_per_output_token": 0.0072,
     ##### Agg. Benchmark #####
     "overall": 84.10,
 }
@@ -536,7 +601,7 @@ LLAMA_4_MAVERICK_MODEL_CARD = {
     "usd_per_input_token": 0.35 / 1e6,
     "usd_per_output_token": 1.15 / 1e6,
     ##### Time #####
-    "seconds_per_output_token": 0.0058,
+    "seconds_per_output_token": 0.0122,
     ##### Agg. Benchmark #####
     "overall": 79.4,
 }
@@ -561,8 +626,12 @@ MODEL_CARDS = {
     Model.GPT_4o_MINI.value: GPT_4o_MINI_MODEL_CARD,
     Model.GPT_4o_AUDIO_PREVIEW.value: GPT_4o_AUDIO_PREVIEW_MODEL_CARD,
     Model.GPT_4o_MINI_AUDIO_PREVIEW.value: GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD,
+    Model.GPT_4_1.value: GPT_4_1_MODEL_CARD,
+    Model.GPT_4_1_MINI.value: GPT_4_1_MINI_MODEL_CARD,
+    Model.GPT_4_1_NANO.value: GPT_4_1_NANO_MODEL_CARD,
     Model.GPT_5.value: GPT_5_MODEL_CARD,
     Model.GPT_5_MINI.value: GPT_5_MINI_MODEL_CARD,
+    Model.GPT_5_NANO.value: GPT_5_NANO_MODEL_CARD,
     Model.o4_MINI.value: o4_MINI_MODEL_CARD,
     # Model.o1.value: o1_MODEL_CARD,
     Model.TEXT_EMBEDDING_3_SMALL.value: TEXT_EMBEDDING_3_SMALL_MODEL_CARD,
@@ -573,6 +642,9 @@ MODEL_CARDS = {
     Model.GEMINI_2_0_FLASH.value: GEMINI_2_0_FLASH_MODEL_CARD,
     Model.GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
     Model.GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
+    Model.GOOGLE_GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
+    Model.GOOGLE_GEMINI_2_5_FLASH_LITE.value: GEMINI_2_5_FLASH_LITE_MODEL_CARD,
+    Model.GOOGLE_GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
     Model.LLAMA_4_MAVERICK.value: LLAMA_4_MAVERICK_MODEL_CARD,
     Model.VLLM_QWEN_1_5_0_5B_CHAT.value: VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD,
 }

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/core/data/dataset.py RENAMED Viewed

@@ -228,7 +228,7 @@ class Dataset:
             id=self.id,
         )
-    def sem_join(self, other: Dataset, condition: str, depends_on: str | list[str] | None = None) -> Dataset:
+    def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
         """
         Perform a semantic (inner) join on the specified join predicate
         """
@@ -244,6 +244,7 @@ class Dataset:
             input_schema=combined_schema,
             output_schema=combined_schema,
             condition=condition,
+            desc=desc,
             depends_on=depends_on,
         )
@@ -277,6 +278,7 @@ class Dataset:
     def sem_filter(
         self,
         filter: str,
+        desc: str | None = None,
         depends_on: str | list[str] | None = None,
     ) -> Dataset:
         """Add a natural language description of a filter to the Set. This filter will possibly restrict the items that are returned later."""
@@ -292,12 +294,13 @@ class Dataset:
             depends_on = [depends_on]
         # construct logical operator
-        operator = FilteredScan(input_schema=self.schema, output_schema=self.schema, filter=f, depends_on=depends_on)
+        operator = FilteredScan(input_schema=self.schema, output_schema=self.schema, filter=f, desc=desc, depends_on=depends_on)
         return Dataset(sources=[self], operator=operator, schema=self.schema)
     def _sem_map(self, cols: list[dict] | type[BaseModel] | None,
                  cardinality: Cardinality,
+                 desc: str | None = None,
                  depends_on: str | list[str] | None = None) -> Dataset:
         """Execute the semantic map operation with the appropriate cardinality."""
         # construct new output schema
@@ -322,6 +325,7 @@ class Dataset:
             output_schema=new_output_schema,
             cardinality=cardinality,
             udf=None,
+            desc=desc,
             depends_on=depends_on,
         )
@@ -330,6 +334,7 @@ class Dataset:
     def sem_add_columns(self, cols: list[dict] | type[BaseModel],
                         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                        desc: str | None = None,
                         depends_on: str | list[str] | None = None) -> Dataset:
         """
         NOTE: we are renaming this function to `sem_map` and deprecating `sem_add_columns` in the next
@@ -354,9 +359,9 @@ class Dataset:
             stacklevel=2
         )
-        return self._sem_map(cols, cardinality, depends_on)
+        return self._sem_map(cols, cardinality, desc, depends_on)
-    def sem_map(self, cols: list[dict] | type[BaseModel], depends_on: str | list[str] | None = None) -> Dataset:
+    def sem_map(self, cols: list[dict] | type[BaseModel], desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
         """
         Compute new field(s) by specifying their names, descriptions, and types. For each input there will
         be one output. The field(s) will be computed during the execution of the Dataset.
@@ -368,9 +373,9 @@ class Dataset:
                  {'name': 'full_name', 'desc': 'The name of the person', 'type': str}]
             )
         """
-        return self._sem_map(cols, Cardinality.ONE_TO_ONE, depends_on)
+        return self._sem_map(cols, Cardinality.ONE_TO_ONE, desc, depends_on)
-    def sem_flat_map(self, cols: list[dict] | type[BaseModel], depends_on: str | list[str] | None = None) -> Dataset:
+    def sem_flat_map(self, cols: list[dict] | type[BaseModel], desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
         """
         Compute new field(s) by specifying their names, descriptions, and types. For each input there will
         be one or more output(s). The field(s) will be computed during the execution of the Dataset.
@@ -384,7 +389,7 @@ class Dataset:
                 ]
             )
         """
-        return self._sem_map(cols, Cardinality.ONE_TO_MANY, depends_on)
+        return self._sem_map(cols, Cardinality.ONE_TO_MANY, desc, depends_on)
     def _map(self, udf: Callable,
             cols: list[dict] | type[BaseModel] | None,
@@ -590,7 +595,7 @@ class Dataset:
         return QueryProcessorFactory.create_and_run_processor(self, config)
-    def optimize_and_run(self, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, config: QueryProcessorConfig | None = None, **kwargs):
+    def optimize_and_run(self, config: QueryProcessorConfig | None = None, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, **kwargs):
         """Optimize the PZ program using the train_dataset and validator before running the optimized plan."""
         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
         from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/convert_prompts.py RENAMED Viewed

@@ -53,7 +53,7 @@ ANSWER:
 COT_QA_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
 You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
+{desc_section}
 {output_format_instruction} Finish your response with a newline character followed by ---
 ---
 INPUT FIELDS:
@@ -72,7 +72,7 @@ REASONING: """
 COT_QA_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
 You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
+{desc_section}
 {output_format_instruction} Finish your response with a newline character followed by ---
 ---
 INPUT FIELDS:

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/filter_prompts.py RENAMED Viewed

@@ -45,7 +45,7 @@ ANSWER: TRUE
 COT_BOOL_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a filter condition. Output TRUE if the context satisfies the filter condition, and FALSE otherwise.
+{desc_section}
 Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
 ---
 INPUT FIELDS:
@@ -62,7 +62,7 @@ REASONING: """
 COT_BOOL_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a filter condition. Output TRUE if the context satisfies the filter condition, and FALSE otherwise.
+{desc_section}
 Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
 ---
 INPUT FIELDS:

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/join_prompts.py RENAMED Viewed

@@ -57,7 +57,7 @@ ANSWER: TRUE
 COT_JOIN_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with two data records and a join condition. Output TRUE if the two data records satisfy the join condition, and FALSE otherwise.
+{desc_section}
 Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
 ---
 LEFT INPUT FIELDS:
@@ -80,7 +80,7 @@ REASONING: """
 COT_JOIN_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with two data records and a join condition. Output TRUE if the two data records satisfy the join condition, and FALSE otherwise.
+{desc_section}
 Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
 ---
 LEFT INPUT FIELDS:

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/moa_proposer_convert_prompts.py RENAMED Viewed

@@ -27,7 +27,7 @@ ANSWER: {example_answer}
 COT_MOA_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
 Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
+{desc_section}
 You will be provided with a description of each input field and each output field.
 ---
 INPUT FIELDS:

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/prompt_factory.py RENAMED Viewed

@@ -138,6 +138,7 @@ from palimpzest.prompts.split_proposer_prompts import (
     SPLIT_PROPOSER_JOB_INSTRUCTION,
 )
 from palimpzest.prompts.util_phrases import (
+    DESC_SECTION,
     ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION,
     ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,
 )
@@ -205,10 +206,11 @@ class PromptFactory:
         PromptStrategy.SPLIT_MERGER: COT_SPLIT_MERGER_BASE_USER_PROMPT,
     }
-    def __init__(self, prompt_strategy: PromptStrategy, model: Model, cardinality: Cardinality) -> None:
+    def __init__(self, prompt_strategy: PromptStrategy, model: Model, cardinality: Cardinality, desc: str | None = None) -> None:
         self.prompt_strategy = prompt_strategy
         self.model = model
         self.cardinality = cardinality
+        self.desc = desc
     def _get_context(self, candidate: DataRecord, input_fields: list[str]) -> str:
         """
@@ -446,6 +448,19 @@ class PromptFactory:
         }
         return prompt_strategy_to_job_instruction.get(self.prompt_strategy)
+    def _get_desc_section(self) -> str:
+        """
+        Returns the description section for the prompt.
+        Returns:
+            str: The description section (if applicable).
+        """
+        desc_section = ""
+        if self.desc is not None:
+            desc_section = DESC_SECTION.format(desc=self.desc)
+        return desc_section
     def _get_critique_criteria(self) -> str | None:
         """
         Returns the critique criteria for the critique operation.
@@ -758,6 +773,7 @@ class PromptFactory:
         prompt_strategy_format_kwargs = {
             "output_format_instruction": self._get_output_format_instruction(),
             "job_instruction": self._get_job_instruction(),
+            "desc_section": self._get_desc_section(),
             "critique_criteria": self._get_critique_criteria(),
             "refinement_criteria": self._get_refinement_criteria(),
             "finish_instruction": self._get_finish_instruction(),

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/split_proposer_prompts.py RENAMED Viewed

@@ -27,7 +27,7 @@ ANSWER: {example_answer}
 COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
 You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
 Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
+{desc_section}
 You will be provided with a description of each input field and each output field.
 ---
 INPUT FIELDS:

{palimpzest-0.8.0 → palimpzest-0.8.2}/src/palimpzest/prompts/util_phrases.py RENAMED Viewed

@@ -12,3 +12,8 @@ REASONING: """
 COT_ANSWER_INSTRUCTION = """Let's think step-by-step in order to answer the question.
 ANSWER: """
+DESC_SECTION = """
+The user has additionally provided you with this description of the task you need to perform:
+{desc}
+"""

palimpzest 0.8.0__tar.gz → 0.8.2__tar.gz

palimpzest 0.8.0__tar.gz → 0.8.2__tar.gz