palimpzest 1.1.1__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. {palimpzest-1.1.1/src/palimpzest.egg-info → palimpzest-1.3.0}/PKG-INFO +3 -2
  2. {palimpzest-1.1.1 → palimpzest-1.3.0}/pyproject.toml +3 -2
  3. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/constants.py +5 -5
  4. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/models.py +71 -1
  5. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/mab_execution_strategy.py +1 -1
  6. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/convert.py +2 -0
  7. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/filter.py +2 -0
  8. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/join.py +10 -6
  9. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/rag.py +14 -10
  10. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/cost_model.py +9 -4
  11. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/config.py +1 -1
  12. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/query_processor_factory.py +25 -0
  13. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/model_helpers.py +7 -8
  14. {palimpzest-1.1.1 → palimpzest-1.3.0/src/palimpzest.egg-info}/PKG-INFO +3 -2
  15. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/requires.txt +2 -1
  16. {palimpzest-1.1.1 → palimpzest-1.3.0}/LICENSE +0 -0
  17. {palimpzest-1.1.1 → palimpzest-1.3.0}/README.md +0 -0
  18. {palimpzest-1.1.1 → palimpzest-1.3.0}/setup.cfg +0 -0
  19. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/__init__.py +0 -0
  20. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/agents/__init__.py +0 -0
  21. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/agents/compute_agents.py +0 -0
  22. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/agents/search_agents.py +0 -0
  23. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/__init__.py +0 -0
  24. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/__init__.py +0 -0
  25. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/context.py +0 -0
  26. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/context_manager.py +0 -0
  27. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/dataset.py +0 -0
  28. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/index_dataset.py +0 -0
  29. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/data/iter_dataset.py +0 -0
  30. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/__init__.py +0 -0
  31. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/filters.py +0 -0
  32. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/groupbysig.py +0 -0
  33. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/elements/records.py +0 -0
  34. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/lib/__init__.py +0 -0
  35. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/lib/schemas.py +0 -0
  36. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/policy.py +0 -0
  37. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/__init__.py +0 -0
  38. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/agent_prompts.py +0 -0
  39. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/aggregate_prompts.py +0 -0
  40. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/context_search.py +0 -0
  41. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/convert_prompts.py +0 -0
  42. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/critique_and_refine_prompts.py +0 -0
  43. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/filter_prompts.py +0 -0
  44. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/join_prompts.py +0 -0
  45. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/moa_aggregator_prompts.py +0 -0
  46. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/moa_proposer_prompts.py +0 -0
  47. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/prompt_factory.py +0 -0
  48. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
  49. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
  50. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/utils.py +0 -0
  51. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/prompts/validator.py +0 -0
  52. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/__init__.py +0 -0
  53. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/__init__.py +0 -0
  54. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/all_sample_execution_strategy.py +0 -0
  55. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/execution_strategy.py +0 -0
  56. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
  57. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/parallel_execution_strategy.py +0 -0
  58. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +0 -0
  59. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/generators/__init__.py +0 -0
  60. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/generators/generators.py +0 -0
  61. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/__init__.py +0 -0
  62. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/aggregate.py +0 -0
  63. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/compute.py +0 -0
  64. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/critique_and_refine.py +0 -0
  65. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/distinct.py +0 -0
  66. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/limit.py +0 -0
  67. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/logical.py +0 -0
  68. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/mixture_of_agents.py +0 -0
  69. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/physical.py +0 -0
  70. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/project.py +0 -0
  71. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/scan.py +0 -0
  72. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/search.py +0 -0
  73. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/split.py +0 -0
  74. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/topk.py +0 -0
  75. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/__init__.py +0 -0
  76. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/optimizer.py +0 -0
  77. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
  78. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
  79. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/plan.py +0 -0
  80. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/primitives.py +0 -0
  81. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/rules.py +0 -0
  82. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/tasks.py +0 -0
  83. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/__init__.py +0 -0
  84. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/query_processor.py +0 -0
  85. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/schemabuilder/__init__.py +0 -0
  86. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
  87. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/README.md +0 -0
  88. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/__init__.py +0 -0
  89. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/allenpdf.py +0 -0
  90. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/pdfparser.py +0 -0
  91. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/tools/skema_tools.py +0 -0
  92. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/__init__.py +0 -0
  93. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/env_helpers.py +0 -0
  94. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/hash_helpers.py +0 -0
  95. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/progress.py +0 -0
  96. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/udfs.py +0 -0
  97. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/validator/__init__.py +0 -0
  98. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/validator/validator.py +0 -0
  99. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/SOURCES.txt +0 -0
  100. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/dependency_links.txt +0 -0
  101. {palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/top_level.txt +0 -0
{palimpzest-1.1.1/src/palimpzest.egg-info → palimpzest-1.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.1.1
+Version: 1.3.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -31,9 +31,10 @@ Requires-Dist: pillow>=11.3.0
 Requires-Dist: prettytable>=3.9.0
 Requires-Dist: psutil==5.9.5
 Requires-Dist: PyLD>=2.0.4
-Requires-Dist: pyarrow==20.0.0
+Requires-Dist: pyarrow>=20.0.0
 Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
+Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: requests>=2.25
 Requires-Dist: ruff>=0.9.0

{palimpzest-1.1.1 → palimpzest-1.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "1.1.1"
+version = "1.3.0"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -25,9 +25,10 @@ dependencies = [
     "prettytable>=3.9.0",
     "psutil==5.9.5",
     "PyLD>=2.0.4",
-    "pyarrow==20.0.0",
+    "pyarrow>=20.0.0",
     "pypdf>=5.1.0",
     "pytest-mock>=3.14.0",
+    "python-dotenv>=1.2.1",
     "pyyaml>=6.0.1",
     "requests>=2.25",
     "ruff>=0.9.0",

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/constants.py
@@ -31,9 +31,9 @@ class Model(str, Enum):
     GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
     GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
     GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
-    GOOGLE_GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
-    GOOGLE_GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
-    GOOGLE_GEMINI_2_5_PRO = "google/gemini-2.5-pro"
+    GOOGLE_GEMINI_2_5_FLASH = "gemini/gemini-2.5-flash"
+    GOOGLE_GEMINI_2_5_FLASH_LITE = "gemini/gemini-2.5-flash-lite"
+    GOOGLE_GEMINI_2_5_PRO = "gemini/gemini-2.5-pro"
     LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
     GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
     GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
@@ -72,8 +72,8 @@ class Model(str, Enum):
     def is_vertex_model(self):
         return "vertex_ai" in self.value.lower()

-    def is_google_model(self):
-        return "google" in self.value.lower()
+    def is_google_ai_studio_model(self):
+        return "gemini/" in self.value.lower()

     def is_vllm_model(self):
         return "hosted_vllm" in self.value.lower()

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/core/models.py
@@ -35,12 +35,18 @@ class GenerationStats(BaseModel):
     # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
     total_output_tokens: float = 0.0

+    # the total number of input tokens processed by embedding models
+    total_embedding_input_tokens: float = 0.0
+
     # the total cost of processing the input tokens; None if this operation did not use an LLM
     total_input_cost: float = 0.0

     # the total cost of processing the output tokens; None if this operation did not use an LLM
     total_output_cost: float = 0.0

+    # the total cost of processing input tokens for embedding models
+    total_embedding_cost: float = 0.0
+
     # the total cost of processing the input and output tokens; None if this operation did not use an LLM
     cost_per_record: float = 0.0

@@ -68,6 +74,9 @@ class GenerationStats(BaseModel):
             "fn_call_duration_secs",
             "total_llm_calls",
             "total_embedding_llm_calls",
+            "total_embedding_input_tokens",
+            "total_embedding_cost"
+
         ]:
             setattr(self, model_field, getattr(self, model_field) + getattr(other, model_field))
         return self
@@ -85,6 +94,8 @@ class GenerationStats(BaseModel):
                 "cost_per_record",
                 "total_llm_calls",
                 "total_embedding_llm_calls",
+                "total_embedding_input_tokens",
+                "total_embedding_cost"
             ]
         }
         # dct['raw_answers'] = self.raw_answers + other.raw_answers
@@ -107,6 +118,8 @@ class GenerationStats(BaseModel):
             "fn_call_duration_secs",
             "total_llm_calls",
             "total_embedding_llm_calls",
+            "total_embedding_input_tokens",
+            "total_embedding_cost"
         ]:
             setattr(self, model_field, getattr(self, model_field) / quotient)
         return self
@@ -128,6 +141,8 @@ class GenerationStats(BaseModel):
                 "total_llm_calls",
                 "total_embedding_llm_calls",
                 "cost_per_record",
+                "total_embedding_input_tokens",
+                "total_embedding_cost"
             ]
         }
         dct["model_name"] = self.model_name
@@ -217,6 +232,10 @@ class RecordOpStats(BaseModel):
     # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
     total_output_tokens: float = 0.0

+    # the total number of input tokens processed by embedding models
+    # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
+    total_embedding_input_tokens: float = 0.0
+
     # the total cost of processing the input tokens; None if this operation did not use an LLM
     total_input_cost: float = 0.0
@@ -278,6 +297,9 @@ class OperatorStats(BaseModel):
     # the total output tokens processed by this operation
     total_output_tokens: int = 0

+    # the total embedding input tokens processed by this operation
+    total_embedding_input_tokens: int = 0
+
     # a list of RecordOpStats processed by the operation
     record_op_stats_lst: list[RecordOpStats] = Field(default_factory=list)

@@ -309,6 +331,7 @@ class OperatorStats(BaseModel):
             self.total_op_cost += stats.total_op_cost
             self.total_input_tokens += stats.total_input_tokens
             self.total_output_tokens += stats.total_output_tokens
+            self.total_embedding_input_tokens += stats.total_embedding_input_tokens
             self.record_op_stats_lst.extend(stats.record_op_stats_lst)

         elif isinstance(stats, RecordOpStats):
@@ -319,6 +342,7 @@ class OperatorStats(BaseModel):
             self.total_op_cost += stats.cost_per_record
             self.total_input_tokens += stats.total_input_tokens
             self.total_output_tokens += stats.total_output_tokens
+            self.total_embedding_input_tokens += stats.total_embedding_input_tokens

         else:
             raise TypeError(f"Cannot add {type(stats)} to OperatorStats")
@@ -370,6 +394,9 @@ class BasePlanStats(BaseModel):
     # total output tokens processed by this plan
     total_output_tokens: int = 0

+    # total embedding input tokens processed by this plan
+    total_embedding_input_tokens: int = 0
+
     # start time for the plan execution; should be set by calling PlanStats.start()
     start_time: float | None = None

@@ -385,6 +412,7 @@ class BasePlanStats(BaseModel):
         self.total_plan_cost = self.sum_op_costs() + self.sum_validation_costs()
         self.total_input_tokens = self.sum_input_tokens() + self.sum_validation_input_tokens()
         self.total_output_tokens = self.sum_output_tokens() + self.sum_validation_output_tokens()
+        self.total_embedding_input_tokens = self.sum_embedding_input_tokens() + self.sum_validation_embedding_input_tokens()

     @staticmethod
     @abstractmethod
@@ -415,6 +443,13 @@ class BasePlanStats(BaseModel):
         """
         pass

+    @abstractmethod
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the input embedding tokens processed by all operators in this plan.
+        """
+        pass
+
     @abstractmethod
     def add_record_op_stats(self, unique_full_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
@@ -453,6 +488,12 @@ class BasePlanStats(BaseModel):
         Sum the output tokens processed by all validation generations in this plan.
         """
         return sum([gen_stats.total_output_tokens for _, gen_stats in self.validation_gen_stats.items()])
+
+    def sum_validation_embedding_input_tokens(self) -> int:
+        """
+        Sum the input embedding tokens processed by all validation generations in this plan.
+        """
+        return sum([gen_stats.total_embedding_input_tokens for _, gen_stats in self.validation_gen_stats.items()])

     def get_total_cost_so_far(self) -> float:
         """
@@ -501,6 +542,12 @@ class PlanStats(BasePlanStats):
         Sum the output tokens processed by all operators in this plan.
         """
         return sum([op_stats.total_output_tokens for _, op_stats in self.operator_stats.items()])
+
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the input embedding tokens processed by all operators in this plan.
+        """
+        return sum([op_stats.total_embedding_input_tokens for _, op_stats in self.operator_stats.items()])

     def add_record_op_stats(self, unique_full_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
@@ -528,6 +575,7 @@ class PlanStats(BasePlanStats):
         self.total_plan_cost += plan_stats.total_plan_cost
         self.total_input_tokens += plan_stats.total_input_tokens
         self.total_output_tokens += plan_stats.total_output_tokens
+        self.total_embedding_input_tokens += plan_stats.total_embedding_input_tokens
         for unique_full_op_id, op_stats in plan_stats.operator_stats.items():
             if unique_full_op_id in self.operator_stats:
                 self.operator_stats[unique_full_op_id] += op_stats
@@ -539,6 +587,7 @@ class PlanStats(BasePlanStats):
         stats += f"total_plan_cost={self.total_plan_cost} \n"
         stats += f"total_input_tokens={self.total_input_tokens} \n"
         stats += f"total_output_tokens={self.total_output_tokens} \n"
+        stats += f"total_embedding_input_tokens={self.total_embedding_input_tokens} \n"
         for idx, op_stats in enumerate(self.operator_stats.values()):
             stats += f"{idx}. {op_stats.op_name} time={op_stats.total_op_time} cost={op_stats.total_op_cost} \n"
         return stats
@@ -586,6 +635,12 @@ class SentinelPlanStats(BasePlanStats):
         Sum the output tokens processed by all operators in this plan.
         """
         return sum(sum([op_stats.total_output_tokens for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())
+
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the embedding input tokens processed by all operators in this plan.
+        """
+        return sum(sum([op_stats.total_embedding_input_tokens for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())

     def add_record_op_stats(self, unique_logical_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
@@ -627,6 +682,7 @@ class SentinelPlanStats(BasePlanStats):
         self.total_plan_cost += plan_stats.total_plan_cost
         self.total_input_tokens += plan_stats.total_input_tokens
         self.total_output_tokens += plan_stats.total_output_tokens
+        self.total_embedding_input_tokens += plan_stats.total_embedding_input_tokens
         for unique_logical_op_id, physical_op_stats in plan_stats.operator_stats.items():
             for full_op_id, op_stats in physical_op_stats.items():
                 if unique_logical_op_id in self.operator_stats:
@@ -648,6 +704,7 @@ class SentinelPlanStats(BasePlanStats):
         stats += f"total_plan_cost={self.total_plan_cost} \n"
         stats += f"total_input_tokens={self.total_input_tokens} \n"
         stats += f"total_output_tokens={self.total_output_tokens} \n"
+        stats += f"total_embedding_input_tokens={self.total_embedding_input_tokens} \n"
         for outer_idx, physical_op_stats in enumerate(self.operator_stats.values()):
             total_time = sum([op_stats.total_op_time for op_stats in physical_op_stats.values()])
             total_cost = sum([op_stats.total_op_cost for op_stats in physical_op_stats.values()])
@@ -695,6 +752,9 @@ class ExecutionStats(BaseModel):
     # total number of output tokens processed
     total_output_tokens: int = 0

+    # total number of embedding input tokens processed
+    total_embedding_input_tokens: int = 0
+
     # total number of tokens processed
     total_tokens: int = 0
@@ -748,7 +808,8 @@ class ExecutionStats(BaseModel):
         # compute the tokens for total execution
         self.total_input_tokens = self.sum_input_tokens()
         self.total_output_tokens = self.sum_output_tokens()
-        self.total_tokens = self.total_input_tokens + self.total_output_tokens
+        self.total_embedding_input_tokens = self.sum_embedding_input_tokens()
+        self.total_tokens = self.total_input_tokens + self.total_output_tokens + self.total_embedding_input_tokens

         # compute plan_strs
         self.plan_strs = {plan_id: plan_stats.plan_str for plan_id, plan_stats in self.plan_stats.items()}
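
Note the behavioral change in this hunk: total_tokens now also counts embedding input tokens. With hypothetical numbers:

    total_input_tokens = 10_000
    total_output_tokens = 2_000
    total_embedding_input_tokens = 3_000

    total_tokens = total_input_tokens + total_output_tokens + total_embedding_input_tokens
    assert total_tokens == 15_000   # 1.1.1 would have reported 12_000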
@@ -780,6 +841,15 @@ class ExecutionStats(BaseModel):
         sentinel_plan_output_tokens = sum([plan_stats.sum_output_tokens() for _, plan_stats in self.sentinel_plan_stats.items()])
         plan_output_tokens = sum([plan_stats.sum_output_tokens() for _, plan_stats in self.plan_stats.items()])
         return plan_output_tokens + sentinel_plan_output_tokens
+
+
+    def sum_embedding_input_tokens(self) -> int:
+        """
+        Sum the embedding input tokens processed in this execution
+        """
+        sentinel_plan_embedding_input_tokens = sum([plan_stats.sum_embedding_input_tokens() for _, plan_stats in self.sentinel_plan_stats.items()])
+        plan_embedding_input_tokens = sum([plan_stats.sum_embedding_input_tokens() for _, plan_stats in self.plan_stats.items()])
+        return plan_embedding_input_tokens + sentinel_plan_embedding_input_tokens

     def add_plan_stats(self, plan_stats: PlanStats | SentinelPlanStats | list[PlanStats] | list[SentinelPlanStats]) -> None:
         """

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/execution/mab_execution_strategy.py
@@ -777,7 +777,7 @@ class MABExecutionStrategy(SentinelExecutionStrategy):

         # if the operator is a non-llm filter which has filtered out records, remove those records from
         # all downstream operators' full_op_id_to_sources_not_processed
-        if isinstance(op_set[0], NonLLMFilter):
+        if isinstance(op_set[0], NonLLMFilter) and next_unique_logical_op_id is not None:
             self._remove_filtered_records_from_downstream_ops(topo_idx, plan, op_frontiers, source_indices_to_all_record_sets)

         # finalize plan stats

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/convert.py
@@ -121,8 +121,10 @@ class ConvertOp(PhysicalOperator, ABC):
             generated_fields=field_names,
             total_input_tokens=per_record_stats.total_input_tokens,
             total_output_tokens=per_record_stats.total_output_tokens,
+            total_embedding_input_tokens=per_record_stats.total_embedding_input_tokens,
             total_input_cost=per_record_stats.total_input_cost,
             total_output_cost=per_record_stats.total_output_cost,
+            total_embedding_cost=per_record_stats.total_embedding_cost,
             llm_call_duration_secs=per_record_stats.llm_call_duration_secs,
             fn_call_duration_secs=per_record_stats.fn_call_duration_secs,
             total_llm_calls=per_record_stats.total_llm_calls,

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/filter.py
@@ -89,8 +89,10 @@ class FilterOp(PhysicalOperator, ABC):
             filter_str=self.filter_obj.get_filter_str(),
             total_input_tokens=generation_stats.total_input_tokens,
             total_output_tokens=generation_stats.total_output_tokens,
+            total_embedding_input_tokens=generation_stats.total_embedding_input_tokens,
             total_input_cost=generation_stats.total_input_cost,
             total_output_cost=generation_stats.total_output_cost,
+            total_embedding_cost=generation_stats.total_embedding_cost,
             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
             total_llm_calls=generation_stats.total_llm_calls,

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/join.py
@@ -376,8 +376,10 @@ class LLMJoin(JoinOp):
             join_condition=self.condition,
             total_input_tokens=generation_stats.total_input_tokens,
             total_output_tokens=generation_stats.total_output_tokens,
+            total_embedding_input_tokens=generation_stats.total_embedding_input_tokens,
             total_input_cost=generation_stats.total_input_cost,
             total_output_cost=generation_stats.total_output_cost,
+            total_embedding_cost=generation_stats.total_embedding_cost,
             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
             total_llm_calls=generation_stats.total_llm_calls,
@@ -584,13 +586,13 @@ class EmbeddingJoin(LLMJoin):
             return np.zeros((0, 512)), GenerationStats()

         start_time = time.time()
-        total_input_tokens = 0
+        total_embedding_input_tokens = 0
         embeddings = None
         if self.text_only:
             client = OpenAI()
             inputs = [dr.to_json_str(bytes_to_str=True, project_cols=input_fields, sorted=True) for dr in candidates]
             response = client.embeddings.create(input=inputs, model=self.embedding_model.value)
-            total_input_tokens = response.usage.total_tokens
+            total_embedding_input_tokens = response.usage.total_tokens
             embeddings = np.array([item.embedding for item in response.data])
         else:
             model = self.locks.get_model(self.embedding_model.value)
@@ -616,14 +618,16 @@ class EmbeddingJoin(LLMJoin):

         # compute cost of embedding(s)
         model_card = MODEL_CARDS[self.embedding_model.value]
-        total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+        total_embedding_cost = model_card["usd_per_input_token"] * total_embedding_input_tokens
         embedding_gen_stats = GenerationStats(
             model_name=self.embedding_model.value,
-            total_input_tokens=total_input_tokens,
+            total_input_tokens=0.0,
             total_output_tokens=0.0,
-            total_input_cost=total_input_cost,
+            total_embedding_input_tokens=total_embedding_input_tokens,
+            total_input_cost=0.0,
             total_output_cost=0.0,
-            cost_per_record=total_input_cost,
+            total_embedding_cost=total_embedding_cost,
+            cost_per_record=total_embedding_cost,
             llm_call_duration_secs=time.time() - start_time,
             total_llm_calls=1,
             total_embedding_llm_calls=len(candidates),
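
The effect of the EmbeddingJoin change is that embedding usage is re-bucketed rather than re-priced: the chat-completion fields are zeroed and the same dollar amount flows through the new embedding fields, leaving cost_per_record unchanged in value. A sketch with a hypothetical model-card price:

    usd_per_input_token = 0.02 / 1_000_000          # hypothetical embedding price
    total_embedding_input_tokens = 4_096            # e.g. response.usage.total_tokens

    total_embedding_cost = usd_per_input_token * total_embedding_input_tokens
    stats_kwargs = dict(
        total_input_tokens=0.0,                     # no longer reported as LLM input
        total_embedding_input_tokens=total_embedding_input_tokens,
        total_input_cost=0.0,
        total_embedding_cost=total_embedding_cost,
        cost_per_record=total_embedding_cost,       # same value as 1.1.1, new bucket
    )
    print(f"embedding cost: ${stats_kwargs['total_embedding_cost']:.8f}")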

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/operators/rag.py
@@ -109,15 +109,17 @@ class RAGConvert(LLMConvert):

         # compute the generation stats object
         model_card = MODEL_CARDS[model_name]
-        total_input_tokens = response.usage.total_tokens
-        total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+        total_embedding_input_tokens = response.usage.total_tokens
+        total_embedding_cost = model_card["usd_per_input_token"] * total_embedding_input_tokens
         embed_stats = GenerationStats(
             model_name=model_name,  # NOTE: this should be overwritten by generation model in convert()
-            total_input_tokens=total_input_tokens,
+            total_input_tokens=0.0,
             total_output_tokens=0.0,
-            total_input_cost=total_input_cost,
+            total_embedding_input_tokens=total_embedding_input_tokens,
+            total_input_cost=0.0,
             total_output_cost=0.0,
-            cost_per_record=total_input_cost,
+            total_embedding_cost=total_embedding_cost,
+            cost_per_record=total_embedding_cost,
             llm_call_duration_secs=total_time,
             total_llm_calls=1,
             total_embedding_llm_calls=1,
@@ -318,15 +320,17 @@ class RAGFilter(LLMFilter):

         # compute the generation stats object
         model_card = MODEL_CARDS[model_name]
-        total_input_tokens = response.usage.total_tokens
-        total_input_cost = model_card["usd_per_input_token"] * total_input_tokens
+        total_embedding_input_tokens = response.usage.total_tokens
+        total_embedding_cost = model_card["usd_per_input_token"] * total_embedding_input_tokens
         embed_stats = GenerationStats(
             model_name=model_name,  # NOTE: this should be overwritten by generation model in filter()
-            total_input_tokens=total_input_tokens,
+            total_input_tokens=0.0,
             total_output_tokens=0.0,
-            total_input_cost=total_input_cost,
+            total_embedding_input_tokens=total_embedding_input_tokens,
+            total_input_cost=0.0,
             total_output_cost=0.0,
-            cost_per_record=total_input_cost,
+            total_embedding_cost=total_embedding_cost,
+            cost_per_record=total_embedding_cost,
             llm_call_duration_secs=total_time,
             total_llm_calls=1,
             total_embedding_llm_calls=1,

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/optimizer/cost_model.py
@@ -105,9 +105,10 @@ class SampleBasedCostModel:
                 "time_per_record": record_op_stats.time_per_record,
                 "quality": record_op_stats.quality,
                 "passed_operator": record_op_stats.passed_operator,
-                "source_indices": record_op_stats.record_source_indices,  # TODO: remove
-                "op_details": record_op_stats.op_details,  # TODO: remove
-                "answer": record_op_stats.answer,  # TODO: remove
+                "source_indices": record_op_stats.record_source_indices,
+                "op_details": record_op_stats.op_details,
+                "answer": record_op_stats.answer,
+                "op_name": record_op_stats.op_name,
             }
             execution_record_op_stats.append(record_op_stats_dict)

@@ -128,8 +129,12 @@ class SampleBasedCostModel:
             else physical_op_df.source_indices.apply(tuple).nunique()
         )

-        # compute selectivity
+        # compute selectivity; for filters this may be 1.0 on small samples;
+        # always put something slightly less than 1.0 to ensure that filters are pushed down when possible
         selectivity = physical_op_df.passed_operator.sum() / num_source_records
+        op_name = physical_op_df.op_name.iloc[0].lower()
+        if selectivity == 1.0 and "filter" in op_name:
+            selectivity -= 1e-3

         # compute quality; if all qualities are None then this will be NaN
         quality = physical_op_df.quality.mean()
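
A worked example of the selectivity nudge: on a small sample where every record passes a filter, the raw estimate of 1.0 would make the filter look free to defer, so it is pulled just below 1.0:

    passed_operator_sum = 20
    num_source_records = 20
    op_name = "llm_filter"

    selectivity = passed_operator_sum / num_source_records   # 1.0 on this sample
    if selectivity == 1.0 and "filter" in op_name:
        selectivity -= 1e-3                                  # keep filter pushdown attractive
    assert selectivity == 0.999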

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/config.py
@@ -27,7 +27,7 @@ class QueryProcessorConfig(BaseModel):
     join_parallelism: int = Field(default=64)
     batch_size: int | None = Field(default=None)
     reasoning_effort: str | None = Field(default=None)  # Gemini: "disable", "low", "medium", "high"
-    use_vertex: bool = Field(default=True)  # Whether to use Vertex models for Gemini or Google models
+    use_vertex: bool = Field(default=False)  # Whether to use Vertex models for Gemini or Google models
     gemini_credentials_path: str | None = Field(default=None)  # Path to Gemini credentials file
     api_base: str | None = Field(default=None)  # API base URL for vLLM

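Flipping the default means fresh configs route Gemini traffic through Google AI Studio ("gemini/..." ids) unless Vertex is requested explicitly. A hypothetical usage sketch (import path inferred from the file layout above):

    from palimpzest.query.processor.config import QueryProcessorConfig

    config = QueryProcessorConfig()          # 1.3.0: use_vertex now defaults to False
    vertex_config = QueryProcessorConfig(    # opting back into Vertex AI
        use_vertex=True,
        gemini_credentials_path="/path/to/creds.json",
    )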

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/query/processor/query_processor_factory.py
@@ -1,6 +1,9 @@
 import logging
+import os
 from enum import Enum

+from dotenv import load_dotenv
+
 from palimpzest.core.data.dataset import Dataset
 from palimpzest.core.elements.records import DataRecordCollection
 from palimpzest.query.execution.execution_strategy import ExecutionStrategy, SentinelExecutionStrategy
@@ -91,6 +94,27 @@ class QueryProcessorFactory:
         # set the final set of available models in the config
         config.available_models = available_models

+        if len(config.available_models) == 0:
+            raise ValueError("No available models found.")
+
+        openai_key = os.getenv("OPENAI_API_KEY")
+        anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+        together_key = os.getenv("TOGETHER_API_KEY")
+        gemini_key = os.getenv("GEMINI_API_KEY")
+        google_key = os.getenv("GOOGLE_API_KEY")
+
+        for model in config.available_models:
+            if model.is_openai_model() and not openai_key:
+                raise ValueError("OPENAI_API_KEY must be set to use OpenAI models.")
+            if model.is_anthropic_model() and not anthropic_key:
+                raise ValueError("ANTHROPIC_API_KEY must be set to use Anthropic models.")
+            if model.is_together_model() and not together_key:
+                raise ValueError("TOGETHER_API_KEY must be set to use Together models.")
+            if model.is_google_ai_studio_model() and not (gemini_key or google_key or config.gemini_credentials_path):
+                raise ValueError("GEMINI_API_KEY, GOOGLE_API_KEY, or gemini_credentials path must be set to use Google Gemini models.")
+            if model.is_vllm_model() and config.api_base is None:
+                raise ValueError("api_base must be set to use vLLM models.")
+
         return config, validator

     @classmethod
@@ -172,6 +196,7 @@ class QueryProcessorFactory:
         train_dataset: dict[str, Dataset] | None = None,
         validator: Validator | None = None,
     ) -> DataRecordCollection:
+        load_dotenv(override=True)
        logger.info(f"Creating processor for dataset: {dataset}")
        processor = cls.create_processor(dataset, config, train_dataset, validator)
        logger.info(f"Created processor: {processor}")

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest/utils/model_helpers.py
@@ -3,13 +3,12 @@ import os
 from palimpzest.constants import Model


-# TODO: better handle vertex vs. google for gemini models
-def get_models(include_embedding: bool = False, use_vertex: bool = True, gemini_credentials_path: str | None = None, api_base: str | None = None) -> list[Model]:
+def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_credentials_path: str | None = None, api_base: str | None = None) -> list[Model]:
     """
     Return the set of models which the system has access to based on the set environment variables.
     """
     models = []
-    if os.getenv("OPENAI_API_KEY") is not None:
+    if os.getenv("OPENAI_API_KEY") not in [None, ""]:
         openai_models = [model for model in Model if model.is_openai_model()]
         if not include_embedding:
             openai_models = [
@@ -17,7 +16,7 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
             ]
         models.extend(openai_models)

-    if os.getenv("TOGETHER_API_KEY") is not None:
+    if os.getenv("TOGETHER_API_KEY") not in [None, ""]:
         together_models = [model for model in Model if model.is_together_model()]
         if not include_embedding:
             together_models = [
@@ -25,7 +24,7 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
             ]
         models.extend(together_models)

-    if os.getenv("ANTHROPIC_API_KEY") is not None:
+    if os.getenv("ANTHROPIC_API_KEY") not in [None, ""]:
         anthropic_models = [model for model in Model if model.is_anthropic_model()]
         if not include_embedding:
             anthropic_models = [
@@ -38,9 +37,9 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
         if gemini_credentials_path is None
         else gemini_credentials_path
     )
-    if os.getenv("GEMINI_API_KEY") is not None or os.path.exists(gemini_credentials_path):
+    if os.getenv("GEMINI_API_KEY") not in [None, ""] or (use_vertex and os.path.exists(gemini_credentials_path)):
         vertex_models = [model for model in Model if model.is_vertex_model()]
-        google_models = [model for model in Model if model.is_google_model()]
+        google_ai_studio_models = [model for model in Model if model.is_google_ai_studio_model()]
         if not include_embedding:
             vertex_models = [
                 model for model in vertex_models if not model.is_embedding_model()
@@ -48,7 +47,7 @@ def get_models(include_embedding: bool = False, use_vertex: bool = False, gemini_
         if use_vertex:
             models.extend(vertex_models)
         else:
-            models.extend(google_models)
+            models.extend(google_ai_studio_models)

     if api_base is not None:
         vllm_models = [model for model in Model if model.is_vllm_model()]
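
The key checks also tightened from `is not None` to `not in [None, ""]`, so an exported-but-empty variable no longer makes a provider look configured. A small illustration:

    import os

    def has_key(name: str) -> bool:
        # 1.3.0-style check: an empty string does not count as a configured key
        return os.getenv(name) not in [None, ""]

    os.environ["OPENAI_API_KEY"] = ""
    assert not has_key("OPENAI_API_KEY")   # 1.1.1's `is not None` check passed here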

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 1.1.1
+Version: 1.3.0
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -31,9 +31,10 @@ Requires-Dist: pillow>=11.3.0
 Requires-Dist: prettytable>=3.9.0
 Requires-Dist: psutil==5.9.5
 Requires-Dist: PyLD>=2.0.4
-Requires-Dist: pyarrow==20.0.0
+Requires-Dist: pyarrow>=20.0.0
 Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
+Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: requests>=2.25
 Requires-Dist: ruff>=0.9.0

{palimpzest-1.1.1 → palimpzest-1.3.0}/src/palimpzest.egg-info/requires.txt
@@ -14,9 +14,10 @@ pillow>=11.3.0
 prettytable>=3.9.0
 psutil==5.9.5
 PyLD>=2.0.4
-pyarrow==20.0.0
+pyarrow>=20.0.0
 pypdf>=5.1.0
 pytest-mock>=3.14.0
+python-dotenv>=1.2.1
 pyyaml>=6.0.1
 requests>=2.25
 ruff>=0.9.0