palimpzest 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
palimpzest/constants.py CHANGED
@@ -18,8 +18,12 @@ class Model(str, Enum):
18
18
  DEEPSEEK_R1_DISTILL_QWEN_1_5B = "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
19
19
  GPT_4o = "openai/gpt-4o-2024-08-06"
20
20
  GPT_4o_MINI = "openai/gpt-4o-mini-2024-07-18"
21
- GPT_5 = "openai/gpt-5"
22
- GPT_5_MINI = "openai/gpt-5-mini"
21
+ GPT_4_1 = "openai/gpt-4.1-2025-04-14"
22
+ GPT_4_1_MINI = "openai/gpt-4.1-mini-2025-04-14"
23
+ GPT_4_1_NANO = "openai/gpt-4.1-nano-2025-04-14"
24
+ GPT_5 = "openai/gpt-5-2025-08-07"
25
+ GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
26
+ GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
23
27
  o4_MINI = "openai/o4-mini-2025-04-16" # noqa: N815
24
28
  TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
25
29
  CLIP_VIT_B_32 = "clip-ViT-B-32"
@@ -29,6 +33,9 @@ class Model(str, Enum):
29
33
  GEMINI_2_0_FLASH = "vertex_ai/gemini-2.0-flash"
30
34
  GEMINI_2_5_FLASH = "vertex_ai/gemini-2.5-flash"
31
35
  GEMINI_2_5_PRO = "vertex_ai/gemini-2.5-pro"
36
+ GOOGLE_GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
37
+ GOOGLE_GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
38
+ GOOGLE_GEMINI_2_5_PRO = "google/gemini-2.5-pro"
32
39
  LLAMA_4_MAVERICK = "vertex_ai/meta/llama-4-maverick-17b-128e-instruct-maas"
33
40
  GPT_4o_AUDIO_PREVIEW = "openai/gpt-4o-audio-preview"
34
41
  GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
@@ -54,7 +61,7 @@ class Model(str, Enum):
54
61
  return self in [Model.o4_MINI]
55
62
 
56
63
  def is_gpt_5_model(self):
57
- return self in [Model.GPT_5, Model.GPT_5_MINI]
64
+ return self in [Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO]
58
65
 
59
66
  def is_openai_model(self):
60
67
  return "openai" in self.value.lower() or self.is_text_embedding_model()
@@ -65,13 +72,17 @@ class Model(str, Enum):
65
72
  def is_vertex_model(self):
66
73
  return "vertex_ai" in self.value.lower()
67
74
 
75
+ def is_google_model(self):
76
+ return "google" in self.value.lower()
77
+
68
78
  def is_vllm_model(self):
69
79
  return "hosted_vllm" in self.value.lower()
70
80
 
71
81
  def is_reasoning_model(self):
72
82
  reasoning_models = [
73
- Model.GPT_5, Model.GPT_5_MINI, Model.o4_MINI,
83
+ Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO, Model.o4_MINI,
74
84
  Model.GEMINI_2_5_PRO, Model.GEMINI_2_5_FLASH,
85
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
75
86
  Model.CLAUDE_3_7_SONNET,
76
87
  ]
77
88
  return self in reasoning_models
@@ -88,27 +99,31 @@ class Model(str, Enum):
88
99
  def is_vision_model(self):
89
100
  return self in [
90
101
  Model.LLAMA3_2_90B_V, Model.LLAMA_4_MAVERICK,
91
- Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
102
+ Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
92
103
  Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
104
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
93
105
  ]
94
106
 
95
107
  def is_audio_model(self):
96
108
  return self in [
97
109
  Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
98
110
  Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
111
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
99
112
  ]
100
113
 
101
114
  def is_text_image_multimodal_model(self):
102
115
  return self in [
103
116
  Model.LLAMA_4_MAVERICK,
104
- Model.GPT_4o, Model.GPT_4o_MINI, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI,
117
+ Model.GPT_4o, Model.GPT_4o_MINI, Model.GPT_4_1, Model.GPT_4_1_MINI, Model.GPT_4_1_NANO, Model.o4_MINI, Model.GPT_5, Model.GPT_5_MINI, Model.GPT_5_NANO,
105
118
  Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
119
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
106
120
  ]
107
121
 
108
122
  def is_text_audio_multimodal_model(self):
109
123
  return self in [
110
124
  Model.GPT_4o_AUDIO_PREVIEW, Model.GPT_4o_MINI_AUDIO_PREVIEW,
111
125
  Model.GEMINI_2_0_FLASH, Model.GEMINI_2_5_FLASH, Model.GEMINI_2_5_PRO,
126
+ Model.GOOGLE_GEMINI_2_5_PRO, Model.GOOGLE_GEMINI_2_5_FLASH, Model.GOOGLE_GEMINI_2_5_FLASH_LITE,
112
127
  ]
113
128
 
114
129
  def is_embedding_model(self):
@@ -327,7 +342,7 @@ LLAMA3_2_3B_INSTRUCT_MODEL_CARD = {
327
342
  "usd_per_input_token": 0.06 / 1e6,
328
343
  "usd_per_output_token": 0.06 / 1e6,
329
344
  ##### Time #####
330
- "seconds_per_output_token": 0.0064,
345
+ "seconds_per_output_token": 0.0079,
331
346
  ##### Agg. Benchmark #####
332
347
  "overall": 36.50, # https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/discussions/13
333
348
  }
@@ -336,7 +351,7 @@ LLAMA3_1_8B_INSTRUCT_MODEL_CARD = {
336
351
  "usd_per_input_token": 0.18 / 1e6,
337
352
  "usd_per_output_token": 0.18 / 1e6,
338
353
  ##### Time #####
339
- "seconds_per_output_token": 0.0059,
354
+ "seconds_per_output_token": 0.0050,
340
355
  ##### Agg. Benchmark #####
341
356
  "overall": 44.25,
342
357
  }
@@ -345,7 +360,7 @@ LLAMA3_3_70B_INSTRUCT_MODEL_CARD = {
345
360
  "usd_per_input_token": 0.88 / 1e6,
346
361
  "usd_per_output_token": 0.88 / 1e6,
347
362
  ##### Time #####
348
- "seconds_per_output_token": 0.0139,
363
+ "seconds_per_output_token": 0.0122,
349
364
  ##### Agg. Benchmark #####
350
365
  "overall": 69.9,
351
366
  }
@@ -354,7 +369,7 @@ LLAMA3_2_90B_V_MODEL_CARD = {
354
369
  "usd_per_input_token": 1.2 / 1e6,
355
370
  "usd_per_output_token": 1.2 / 1e6,
356
371
  ##### Time #####
357
- "seconds_per_output_token": 0.0222,
372
+ "seconds_per_output_token": 0.0303,
358
373
  ##### Agg. Benchmark #####
359
374
  "overall": 65.00, # set to be slightly higher than gpt-4o-mini
360
375
  }
@@ -363,7 +378,7 @@ DEEPSEEK_V3_MODEL_CARD = {
363
378
  "usd_per_input_token": 1.25 / 1E6,
364
379
  "usd_per_output_token": 1.25 / 1E6,
365
380
  ##### Time #####
366
- "seconds_per_output_token": 0.0769,
381
+ "seconds_per_output_token": 0.0114,
367
382
  ##### Agg. Benchmark #####
368
383
  "overall": 73.8,
369
384
  }
@@ -372,7 +387,7 @@ DEEPSEEK_R1_DISTILL_QWEN_1_5B_MODEL_CARD = {
372
387
  "usd_per_input_token": 0.18 / 1E6,
373
388
  "usd_per_output_token": 0.18 / 1E6,
374
389
  ##### Time #####
375
- "seconds_per_output_token": 0.0026,
390
+ "seconds_per_output_token": 0.0050, # NOTE: copied to be same as LLAMA3_1_8B_INSTRUCT_MODEL_CARD; need to update when we have data
376
391
  ##### Agg. Benchmark #####
377
392
  "overall": 39.90, # https://www.reddit.com/r/LocalLLaMA/comments/1iserf9/deepseek_r1_distilled_models_mmlu_pro_benchmarks/
378
393
  }
@@ -382,7 +397,7 @@ GPT_4o_AUDIO_PREVIEW_MODEL_CARD = {
382
397
  "usd_per_audio_input_token": 2.5 / 1e6,
383
398
  "usd_per_output_token": 10.0 / 1e6,
384
399
  ##### Time #####
385
- "seconds_per_output_token": 0.0079,
400
+ "seconds_per_output_token": 0.0080,
386
401
  ##### Agg. Benchmark #####
387
402
  "overall": 74.1,
388
403
  }
@@ -392,7 +407,7 @@ GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD = {
392
407
  "usd_per_audio_input_token": 0.15 / 1e6,
393
408
  "usd_per_output_token": 0.6 / 1e6,
394
409
  ##### Time #####
395
- "seconds_per_output_token": 0.0098,
410
+ "seconds_per_output_token": 0.0159,
396
411
  ##### Agg. Benchmark #####
397
412
  "overall": 62.7,
398
413
  }
@@ -402,7 +417,7 @@ GPT_4o_MODEL_CARD = {
402
417
  "usd_per_input_token": 2.5 / 1e6,
403
418
  "usd_per_output_token": 10.0 / 1e6,
404
419
  ##### Time #####
405
- "seconds_per_output_token": 0.0079,
420
+ "seconds_per_output_token": 0.0080,
406
421
  ##### Agg. Benchmark #####
407
422
  "overall": 74.1,
408
423
  }
@@ -412,17 +427,47 @@ GPT_4o_MINI_MODEL_CARD = {
412
427
  "usd_per_input_token": 0.15 / 1e6,
413
428
  "usd_per_output_token": 0.6 / 1e6,
414
429
  ##### Time #####
415
- "seconds_per_output_token": 0.0098,
430
+ "seconds_per_output_token": 0.0159,
416
431
  ##### Agg. Benchmark #####
417
432
  "overall": 62.7,
418
433
  }
434
+ GPT_4_1_MODEL_CARD = {
435
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
436
+ ##### Cost in USD #####
437
+ "usd_per_input_token": 2.0 / 1e6,
438
+ "usd_per_output_token": 8.0 / 1e6,
439
+ ##### Time #####
440
+ "seconds_per_output_token": 0.0076,
441
+ ##### Agg. Benchmark #####
442
+ "overall": 80.5,
443
+ }
444
+ GPT_4_1_MINI_MODEL_CARD = {
445
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
446
+ ##### Cost in USD #####
447
+ "usd_per_input_token": 0.4 / 1e6,
448
+ "usd_per_output_token": 1.6 / 1e6,
449
+ ##### Time #####
450
+ "seconds_per_output_token": 0.0161,
451
+ ##### Agg. Benchmark #####
452
+ "overall": 77.2,
453
+ }
454
+ GPT_4_1_NANO_MODEL_CARD = {
455
+ # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
456
+ ##### Cost in USD #####
457
+ "usd_per_input_token": 0.1 / 1e6,
458
+ "usd_per_output_token": 0.4 / 1e6,
459
+ ##### Time #####
460
+ "seconds_per_output_token": 0.0060,
461
+ ##### Agg. Benchmark #####
462
+ "overall": 62.3,
463
+ }
419
464
  GPT_5_MODEL_CARD = {
420
465
  # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
421
466
  ##### Cost in USD #####
422
467
  "usd_per_input_token": 1.25 / 1e6,
423
468
  "usd_per_output_token": 10.0 / 1e6,
424
469
  ##### Time #####
425
- "seconds_per_output_token": 0.0139,
470
+ "seconds_per_output_token": 0.0060,
426
471
  ##### Agg. Benchmark #####
427
472
  "overall": 87.00,
428
473
  }
@@ -432,30 +477,40 @@ GPT_5_MINI_MODEL_CARD = {
432
477
  "usd_per_input_token": 0.25 / 1e6,
433
478
  "usd_per_output_token": 2.0 / 1e6,
434
479
  ##### Time #####
435
- "seconds_per_output_token": 0.0094,
480
+ "seconds_per_output_token": 0.0135,
436
481
  ##### Agg. Benchmark #####
437
482
  "overall": 82.50,
438
483
  }
439
- o4_MINI_MODEL_CARD = { # noqa: N816
484
+ GPT_5_NANO_MODEL_CARD = {
440
485
  # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
441
486
  ##### Cost in USD #####
442
- "usd_per_input_token": 1.1 / 1e6,
443
- "usd_per_output_token": 4.4 / 1e6,
487
+ "usd_per_input_token": 0.05 / 1e6,
488
+ "usd_per_output_token": 0.4 / 1e6,
444
489
  ##### Time #####
445
- "seconds_per_output_token": 0.0093,
490
+ "seconds_per_output_token": 0.0055,
446
491
  ##### Agg. Benchmark #####
447
- "overall": 80.6, # using number reported for o3-mini; true number is likely higher
492
+ "overall": 77.9,
448
493
  }
449
- o1_MODEL_CARD = { # noqa: N816
494
+ o4_MINI_MODEL_CARD = { # noqa: N816
450
495
  # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
451
496
  ##### Cost in USD #####
452
- "usd_per_input_token": 15 / 1e6,
453
- "usd_per_output_token": 60 / 1e6,
497
+ "usd_per_input_token": 1.1 / 1e6,
498
+ "usd_per_output_token": 4.4 / 1e6,
454
499
  ##### Time #####
455
- "seconds_per_output_token": 0.0110,
500
+ "seconds_per_output_token": 0.0092,
456
501
  ##### Agg. Benchmark #####
457
- "overall": 83.50,
502
+ "overall": 80.6, # using number reported for o3-mini; true number is likely higher
458
503
  }
504
+ # o1_MODEL_CARD = { # noqa: N816
505
+ # # NOTE: it is unclear if the same ($ / token) costs can be applied for vision, or if we have to calculate this ourselves
506
+ # ##### Cost in USD #####
507
+ # "usd_per_input_token": 15 / 1e6,
508
+ # "usd_per_output_token": 60 / 1e6,
509
+ # ##### Time #####
510
+ # "seconds_per_output_token": 0.0110,
511
+ # ##### Agg. Benchmark #####
512
+ # "overall": 83.50,
513
+ # }
459
514
  TEXT_EMBEDDING_3_SMALL_MODEL_CARD = {
460
515
  ##### Cost in USD #####
461
516
  "usd_per_input_token": 0.02 / 1e6,
@@ -479,7 +534,7 @@ CLAUDE_3_5_SONNET_MODEL_CARD = {
479
534
  "usd_per_input_token": 3.0 / 1e6,
480
535
  "usd_per_output_token": 15.0 / 1e6,
481
536
  ##### Time #####
482
- "seconds_per_output_token": 0.0127,
537
+ "seconds_per_output_token": 0.0154,
483
538
  ##### Agg. Benchmark #####
484
539
  "overall": 78.4,
485
540
  }
@@ -488,7 +543,7 @@ CLAUDE_3_7_SONNET_MODEL_CARD = {
488
543
  "usd_per_input_token": 3.0 / 1e6,
489
544
  "usd_per_output_token": 15.0 / 1e6,
490
545
  ##### Time #####
491
- "seconds_per_output_token": 0.0130,
546
+ "seconds_per_output_token": 0.0156,
492
547
  ##### Agg. Benchmark #####
493
548
  "overall": 80.7,
494
549
  }
@@ -497,7 +552,7 @@ CLAUDE_3_5_HAIKU_MODEL_CARD = {
497
552
  "usd_per_input_token": 0.8 / 1e6,
498
553
  "usd_per_output_token": 4.0 / 1e6,
499
554
  ##### Time #####
500
- "seconds_per_output_token": 0.0152,
555
+ "seconds_per_output_token": 0.0189,
501
556
  ##### Agg. Benchmark #####
502
557
  "overall": 64.1,
503
558
  }
@@ -507,17 +562,27 @@ GEMINI_2_0_FLASH_MODEL_CARD = {
507
562
  "usd_per_output_token": 0.6 / 1e6,
508
563
  "usd_per_audio_input_token": 1.0 / 1e6,
509
564
  ##### Time #####
510
- "seconds_per_output_token": 0.0049,
565
+ "seconds_per_output_token": 0.0054,
511
566
  ##### Agg. Benchmark #####
512
567
  "overall": 77.40,
513
568
  }
569
+ GEMINI_2_5_FLASH_LITE_MODEL_CARD = {
570
+ ##### Cost in USD #####
571
+ "usd_per_input_token": 0.1 / 1e6,
572
+ "usd_per_output_token": 0.4 / 1e6,
573
+ "usd_per_audio_input_token": 0.3 / 1e6,
574
+ ##### Time #####
575
+ "seconds_per_output_token": 0.0034,
576
+ ##### Agg. Benchmark #####
577
+ "overall": 79.1, # NOTE: interpolated between gemini 2.5 flash and gemini 2.0 flash
578
+ }
514
579
  GEMINI_2_5_FLASH_MODEL_CARD = {
515
580
  ##### Cost in USD #####
516
581
  "usd_per_input_token": 0.30 / 1e6,
517
582
  "usd_per_output_token": 2.5 / 1e6,
518
583
  "usd_per_audio_input_token": 1.0 / 1e6,
519
584
  ##### Time #####
520
- "seconds_per_output_token": 0.0039,
585
+ "seconds_per_output_token": 0.0044,
521
586
  ##### Agg. Benchmark #####
522
587
  "overall": 80.75, # NOTE: interpolated between gemini 2.0 flash and gemini 2.5 pro
523
588
  }
@@ -527,7 +592,7 @@ GEMINI_2_5_PRO_MODEL_CARD = {
527
592
  "usd_per_output_token": 10.0 / 1e6,
528
593
  "usd_per_audio_input_token": 1.25 / 1e6,
529
594
  ##### Time #####
530
- "seconds_per_output_token": 0.0070,
595
+ "seconds_per_output_token": 0.0072,
531
596
  ##### Agg. Benchmark #####
532
597
  "overall": 84.10,
533
598
  }
@@ -536,7 +601,7 @@ LLAMA_4_MAVERICK_MODEL_CARD = {
536
601
  "usd_per_input_token": 0.35 / 1e6,
537
602
  "usd_per_output_token": 1.15 / 1e6,
538
603
  ##### Time #####
539
- "seconds_per_output_token": 0.0058,
604
+ "seconds_per_output_token": 0.0122,
540
605
  ##### Agg. Benchmark #####
541
606
  "overall": 79.4,
542
607
  }
@@ -561,8 +626,12 @@ MODEL_CARDS = {
561
626
  Model.GPT_4o_MINI.value: GPT_4o_MINI_MODEL_CARD,
562
627
  Model.GPT_4o_AUDIO_PREVIEW.value: GPT_4o_AUDIO_PREVIEW_MODEL_CARD,
563
628
  Model.GPT_4o_MINI_AUDIO_PREVIEW.value: GPT_4o_MINI_AUDIO_PREVIEW_MODEL_CARD,
629
+ Model.GPT_4_1.value: GPT_4_1_MODEL_CARD,
630
+ Model.GPT_4_1_MINI.value: GPT_4_1_MINI_MODEL_CARD,
631
+ Model.GPT_4_1_NANO.value: GPT_4_1_NANO_MODEL_CARD,
564
632
  Model.GPT_5.value: GPT_5_MODEL_CARD,
565
633
  Model.GPT_5_MINI.value: GPT_5_MINI_MODEL_CARD,
634
+ Model.GPT_5_NANO.value: GPT_5_NANO_MODEL_CARD,
566
635
  Model.o4_MINI.value: o4_MINI_MODEL_CARD,
567
636
  # Model.o1.value: o1_MODEL_CARD,
568
637
  Model.TEXT_EMBEDDING_3_SMALL.value: TEXT_EMBEDDING_3_SMALL_MODEL_CARD,
@@ -573,6 +642,9 @@ MODEL_CARDS = {
573
642
  Model.GEMINI_2_0_FLASH.value: GEMINI_2_0_FLASH_MODEL_CARD,
574
643
  Model.GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
575
644
  Model.GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
645
+ Model.GOOGLE_GEMINI_2_5_FLASH.value: GEMINI_2_5_FLASH_MODEL_CARD,
646
+ Model.GOOGLE_GEMINI_2_5_FLASH_LITE.value: GEMINI_2_5_FLASH_LITE_MODEL_CARD,
647
+ Model.GOOGLE_GEMINI_2_5_PRO.value: GEMINI_2_5_PRO_MODEL_CARD,
576
648
  Model.LLAMA_4_MAVERICK.value: LLAMA_4_MAVERICK_MODEL_CARD,
577
649
  Model.VLLM_QWEN_1_5_0_5B_CHAT.value: VLLM_QWEN_1_5_0_5B_CHAT_MODEL_CARD,
578
650
  }
@@ -228,7 +228,7 @@ class Dataset:
228
228
  id=self.id,
229
229
  )
230
230
 
231
- def sem_join(self, other: Dataset, condition: str, depends_on: str | list[str] | None = None) -> Dataset:
231
+ def sem_join(self, other: Dataset, condition: str, desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
232
232
  """
233
233
  Perform a semantic (inner) join on the specified join predicate
234
234
  """
@@ -244,6 +244,7 @@ class Dataset:
244
244
  input_schema=combined_schema,
245
245
  output_schema=combined_schema,
246
246
  condition=condition,
247
+ desc=desc,
247
248
  depends_on=depends_on,
248
249
  )
249
250
 
@@ -277,6 +278,7 @@ class Dataset:
277
278
  def sem_filter(
278
279
  self,
279
280
  filter: str,
281
+ desc: str | None = None,
280
282
  depends_on: str | list[str] | None = None,
281
283
  ) -> Dataset:
282
284
  """Add a natural language description of a filter to the Set. This filter will possibly restrict the items that are returned later."""
@@ -292,12 +294,13 @@ class Dataset:
292
294
  depends_on = [depends_on]
293
295
 
294
296
  # construct logical operator
295
- operator = FilteredScan(input_schema=self.schema, output_schema=self.schema, filter=f, depends_on=depends_on)
297
+ operator = FilteredScan(input_schema=self.schema, output_schema=self.schema, filter=f, desc=desc, depends_on=depends_on)
296
298
 
297
299
  return Dataset(sources=[self], operator=operator, schema=self.schema)
298
300
 
299
301
  def _sem_map(self, cols: list[dict] | type[BaseModel] | None,
300
302
  cardinality: Cardinality,
303
+ desc: str | None = None,
301
304
  depends_on: str | list[str] | None = None) -> Dataset:
302
305
  """Execute the semantic map operation with the appropriate cardinality."""
303
306
  # construct new output schema
@@ -322,6 +325,7 @@ class Dataset:
322
325
  output_schema=new_output_schema,
323
326
  cardinality=cardinality,
324
327
  udf=None,
328
+ desc=desc,
325
329
  depends_on=depends_on,
326
330
  )
327
331
 
@@ -330,6 +334,7 @@ class Dataset:
330
334
 
331
335
  def sem_add_columns(self, cols: list[dict] | type[BaseModel],
332
336
  cardinality: Cardinality = Cardinality.ONE_TO_ONE,
337
+ desc: str | None = None,
333
338
  depends_on: str | list[str] | None = None) -> Dataset:
334
339
  """
335
340
  NOTE: we are renaming this function to `sem_map` and deprecating `sem_add_columns` in the next
@@ -354,9 +359,9 @@ class Dataset:
354
359
  stacklevel=2
355
360
  )
356
361
 
357
- return self._sem_map(cols, cardinality, depends_on)
362
+ return self._sem_map(cols, cardinality, desc, depends_on)
358
363
 
359
- def sem_map(self, cols: list[dict] | type[BaseModel], depends_on: str | list[str] | None = None) -> Dataset:
364
+ def sem_map(self, cols: list[dict] | type[BaseModel], desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
360
365
  """
361
366
  Compute new field(s) by specifying their names, descriptions, and types. For each input there will
362
367
  be one output. The field(s) will be computed during the execution of the Dataset.
@@ -368,9 +373,9 @@ class Dataset:
368
373
  {'name': 'full_name', 'desc': 'The name of the person', 'type': str}]
369
374
  )
370
375
  """
371
- return self._sem_map(cols, Cardinality.ONE_TO_ONE, depends_on)
376
+ return self._sem_map(cols, Cardinality.ONE_TO_ONE, desc, depends_on)
372
377
 
373
- def sem_flat_map(self, cols: list[dict] | type[BaseModel], depends_on: str | list[str] | None = None) -> Dataset:
378
+ def sem_flat_map(self, cols: list[dict] | type[BaseModel], desc: str | None = None, depends_on: str | list[str] | None = None) -> Dataset:
374
379
  """
375
380
  Compute new field(s) by specifying their names, descriptions, and types. For each input there will
376
381
  be one or more output(s). The field(s) will be computed during the execution of the Dataset.
@@ -384,7 +389,7 @@ class Dataset:
384
389
  ]
385
390
  )
386
391
  """
387
- return self._sem_map(cols, Cardinality.ONE_TO_MANY, depends_on)
392
+ return self._sem_map(cols, Cardinality.ONE_TO_MANY, desc, depends_on)
388
393
 
389
394
  def _map(self, udf: Callable,
390
395
  cols: list[dict] | type[BaseModel] | None,
@@ -590,7 +595,7 @@ class Dataset:
590
595
 
591
596
  return QueryProcessorFactory.create_and_run_processor(self, config)
592
597
 
593
- def optimize_and_run(self, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, config: QueryProcessorConfig | None = None, **kwargs):
598
+ def optimize_and_run(self, config: QueryProcessorConfig | None = None, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, **kwargs):
594
599
  """Optimize the PZ program using the train_dataset and validator before running the optimized plan."""
595
600
  # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
596
601
  from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
@@ -53,7 +53,7 @@ ANSWER:
53
53
  COT_QA_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
54
54
  You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
55
55
  You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
56
-
56
+ {desc_section}
57
57
  {output_format_instruction} Finish your response with a newline character followed by ---
58
58
  ---
59
59
  INPUT FIELDS:
@@ -72,7 +72,7 @@ REASONING: """
72
72
  COT_QA_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
73
73
  You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
74
74
  You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.
75
-
75
+ {desc_section}
76
76
  {output_format_instruction} Finish your response with a newline character followed by ---
77
77
  ---
78
78
  INPUT FIELDS:
@@ -45,7 +45,7 @@ ANSWER: TRUE
45
45
 
46
46
  COT_BOOL_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
47
47
  You will be presented with a context and a filter condition. Output TRUE if the context satisfies the filter condition, and FALSE otherwise.
48
-
48
+ {desc_section}
49
49
  Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
50
50
  ---
51
51
  INPUT FIELDS:
@@ -62,7 +62,7 @@ REASONING: """
62
62
 
63
63
  COT_BOOL_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
64
64
  You will be presented with a context and a filter condition. Output TRUE if the context satisfies the filter condition, and FALSE otherwise.
65
-
65
+ {desc_section}
66
66
  Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
67
67
  ---
68
68
  INPUT FIELDS:
@@ -57,7 +57,7 @@ ANSWER: TRUE
57
57
 
58
58
  COT_JOIN_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
59
59
  You will be presented with two data records and a join condition. Output TRUE if the two data records satisfy the join condition, and FALSE otherwise.
60
-
60
+ {desc_section}
61
61
  Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
62
62
  ---
63
63
  LEFT INPUT FIELDS:
@@ -80,7 +80,7 @@ REASONING: """
80
80
 
81
81
  COT_JOIN_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
82
82
  You will be presented with two data records and a join condition. Output TRUE if the two data records satisfy the join condition, and FALSE otherwise.
83
-
83
+ {desc_section}
84
84
  Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
85
85
  ---
86
86
  LEFT INPUT FIELDS:
@@ -27,7 +27,7 @@ ANSWER: {example_answer}
27
27
  COT_MOA_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
28
28
  You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
29
29
  Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
30
-
30
+ {desc_section}
31
31
  You will be provided with a description of each input field and each output field.
32
32
  ---
33
33
  INPUT FIELDS:
@@ -138,6 +138,7 @@ from palimpzest.prompts.split_proposer_prompts import (
138
138
  SPLIT_PROPOSER_JOB_INSTRUCTION,
139
139
  )
140
140
  from palimpzest.prompts.util_phrases import (
141
+ DESC_SECTION,
141
142
  ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION,
142
143
  ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,
143
144
  )
@@ -205,10 +206,11 @@ class PromptFactory:
205
206
  PromptStrategy.SPLIT_MERGER: COT_SPLIT_MERGER_BASE_USER_PROMPT,
206
207
  }
207
208
 
208
- def __init__(self, prompt_strategy: PromptStrategy, model: Model, cardinality: Cardinality) -> None:
209
+ def __init__(self, prompt_strategy: PromptStrategy, model: Model, cardinality: Cardinality, desc: str | None = None) -> None:
209
210
  self.prompt_strategy = prompt_strategy
210
211
  self.model = model
211
212
  self.cardinality = cardinality
213
+ self.desc = desc
212
214
 
213
215
  def _get_context(self, candidate: DataRecord, input_fields: list[str]) -> str:
214
216
  """
@@ -446,6 +448,19 @@ class PromptFactory:
446
448
  }
447
449
  return prompt_strategy_to_job_instruction.get(self.prompt_strategy)
448
450
 
451
+ def _get_desc_section(self) -> str:
452
+ """
453
+ Returns the description section for the prompt.
454
+
455
+ Returns:
456
+ str: The description section (if applicable).
457
+ """
458
+ desc_section = ""
459
+ if self.desc is not None:
460
+ desc_section = DESC_SECTION.format(desc=self.desc)
461
+
462
+ return desc_section
463
+
449
464
  def _get_critique_criteria(self) -> str | None:
450
465
  """
451
466
  Returns the critique criteria for the critique operation.
@@ -758,6 +773,7 @@ class PromptFactory:
758
773
  prompt_strategy_format_kwargs = {
759
774
  "output_format_instruction": self._get_output_format_instruction(),
760
775
  "job_instruction": self._get_job_instruction(),
776
+ "desc_section": self._get_desc_section(),
761
777
  "critique_criteria": self._get_critique_criteria(),
762
778
  "refinement_criteria": self._get_refinement_criteria(),
763
779
  "finish_instruction": self._get_finish_instruction(),
@@ -27,7 +27,7 @@ ANSWER: {example_answer}
27
27
  COT_SPLIT_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
28
28
  You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
29
29
  Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.
30
-
30
+ {desc_section}
31
31
  You will be provided with a description of each input field and each output field.
32
32
  ---
33
33
  INPUT FIELDS:
@@ -12,3 +12,8 @@ REASONING: """
12
12
  COT_ANSWER_INSTRUCTION = """Let's think step-by-step in order to answer the question.
13
13
 
14
14
  ANSWER: """
15
+
16
+ DESC_SECTION = """
17
+ The user has additionally provided you with this description of the task you need to perform:
18
+ {desc}
19
+ """
@@ -2,16 +2,19 @@
2
2
  import logging
3
3
 
4
4
  import numpy as np
5
+ from chromadb.api.models.Collection import Collection
5
6
 
6
7
  from palimpzest.core.data.dataset import Dataset
7
8
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
8
- from palimpzest.core.models import OperatorStats, RecordOpStats, SentinelPlanStats
9
+ from palimpzest.core.models import OperatorCostEstimates, OperatorStats, RecordOpStats, SentinelPlanStats
9
10
  from palimpzest.policy import Policy
10
11
  from palimpzest.query.execution.execution_strategy import SentinelExecutionStrategy
11
12
  from palimpzest.query.operators.aggregate import AggregateOp
12
- from palimpzest.query.operators.filter import FilterOp
13
+ from palimpzest.query.operators.convert import LLMConvert
14
+ from palimpzest.query.operators.filter import FilterOp, LLMFilter
13
15
  from palimpzest.query.operators.join import JoinOp
14
16
  from palimpzest.query.operators.physical import PhysicalOperator
17
+ from palimpzest.query.operators.retrieve import RetrieveOp
15
18
  from palimpzest.query.operators.scan import ContextScanOp, ScanPhysicalOp
16
19
  from palimpzest.query.optimizer.plan import SentinelPlan
17
20
  from palimpzest.utils.progress import create_progress_manager
@@ -55,6 +58,17 @@ class OpFrontier:
55
58
  # store the prior beliefs on operator performance (if provided)
56
59
  self.priors = priors
57
60
 
61
+ # boolean indication of the type of operator in this OpFrontier
62
+ sample_op = op_set[0]
63
+ self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
64
+ self.is_filter_op = isinstance(sample_op, FilterOp)
65
+ self.is_aggregate_op = isinstance(sample_op, AggregateOp)
66
+ self.is_llm_join = isinstance(sample_op, JoinOp)
67
+ is_llm_convert = isinstance(sample_op, LLMConvert)
68
+ is_llm_filter = isinstance(sample_op, LLMFilter)
69
+ is_llm_retrieve = isinstance(sample_op, RetrieveOp) and isinstance(sample_op.index, Collection)
70
+ self.is_llm_op = is_llm_convert or is_llm_filter or is_llm_retrieve or self.is_llm_join
71
+
58
72
  # get order in which we will sample physical operators for this logical operator
59
73
  sample_op_indices = self._get_op_index_order(op_set, seed)
60
74
 
@@ -68,13 +82,6 @@ class OpFrontier:
68
82
  self.full_op_id_to_sources_not_processed = {op.get_full_op_id(): source_indices for op in op_set}
69
83
  self.max_inputs = len(source_indices)
70
84
 
71
- # boolean indication of the type of operator in this OpFrontier
72
- sample_op = op_set[0]
73
- self.is_scan_op = isinstance(sample_op, (ScanPhysicalOp, ContextScanOp))
74
- self.is_filter_op = isinstance(sample_op, FilterOp)
75
- self.is_aggregate_op = isinstance(sample_op, AggregateOp)
76
- self.is_llm_join = isinstance(sample_op, JoinOp)
77
-
78
85
  # set the initial inputs for this logical operator; we maintain a mapping from source_unique_logical_op_id --> source_indices --> input;
79
86
  # for each unique source and (tuple of) source indices, we store its output, which is an input to this operator
80
87
  # for scan operators, we use the default name "source" since these operators have no source
@@ -149,16 +156,44 @@ class OpFrontier:
149
156
 
150
157
  return op_id_to_pareto_distance
151
158
 
159
+ def _compute_naive_priors(self, op_set: list[PhysicalOperator]) -> dict[str, dict[str, float]]:
160
+ naive_priors = {}
161
+ for op in op_set:
162
+ # use naive cost estimates with dummy source estimates to compute priors
163
+ source_op_estimates = OperatorCostEstimates(quality=1.0, cost_per_record=0.0, time_per_record=0.0, cardinality=100)
164
+ op_estimates = (
165
+ op.naive_cost_estimates(source_op_estimates, source_op_estimates)
166
+ if self.is_llm_join
167
+ else op.naive_cost_estimates(source_op_estimates)
168
+ )
169
+
170
+ # get op_id for this operator
171
+ op_id = op.get_op_id()
172
+
173
+ # set the naive quality, cost, and time priors for this operator
174
+ naive_priors[op_id] = {
175
+ "quality": op_estimates.quality,
176
+ "cost": op_estimates.cost_per_record,
177
+ "time": op_estimates.time_per_record,
178
+ }
179
+
180
+ return naive_priors
181
+
152
182
  def _get_op_index_order(self, op_set: list[PhysicalOperator], seed: int) -> list[int]:
153
183
  """
154
184
  Returns a list of indices for the operators in the op_set.
155
185
  """
156
- if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
186
+ # if this is not an llm-operator, we simply return the indices in random order
187
+ if not self.is_llm_op:
157
188
  rng = np.random.default_rng(seed=seed)
158
189
  op_indices = np.arange(len(op_set))
159
190
  rng.shuffle(op_indices)
160
191
  return op_indices
161
192
 
193
+ # if this is an llm-operator, but we do not have priors, we first compute naive priors
194
+ if self.priors is None or any([op_id not in self.priors for op_id in map(lambda op: op.get_op_id(), op_set)]):
195
+ self.priors = self._compute_naive_priors(op_set)
196
+
162
197
  # NOTE: self.priors is a dictionary with format:
163
198
  # {op_id: {"quality": quality, "cost": cost, "time": time}}
164
199
 
@@ -215,7 +250,7 @@ class OpFrontier:
215
250
  op_source_indices_pairs = []
216
251
 
217
252
  # if this operator is not being optimized: we don't request inputs, but simply process what we are given / told to (in the case of scans)
218
- if not self.is_llm_join and len(self.frontier_ops) == 1:
253
+ if not self.is_llm_op and len(self.frontier_ops) == 1:
219
254
  return [(self.frontier_ops[0], None)]
220
255
 
221
256
  # otherwise, sample (operator, source_indices) pairs
@@ -255,16 +290,6 @@ class OpFrontier:
255
290
  all_inputs.extend(inputs)
256
291
  return [(op, tuple(), all_inputs)]
257
292
 
258
- # if this is an un-optimized (non-scan, non-join) operator, flatten inputs and run on each one
259
- elif not self.is_scan_op and not self.is_llm_join and len(self.frontier_ops) == 1:
260
- op_inputs = []
261
- op = self.frontier_ops[0]
262
- for _, source_indices_to_inputs in self.source_indices_to_inputs.items():
263
- for source_indices, inputs in source_indices_to_inputs.items():
264
- for input in inputs:
265
- op_inputs.append((op, source_indices, input))
266
- return op_inputs
267
-
268
293
  ### for optimized operators
269
294
  # get the list of (op, source_indices) pairs which this operator needs to execute
270
295
  op_source_indices_pairs = self._get_op_source_indices_pairs()
@@ -111,6 +111,7 @@ class Generator(Generic[ContextType, InputType]):
111
111
  reasoning_effort: str | None = None,
112
112
  api_base: str | None = None,
113
113
  cardinality: Cardinality = Cardinality.ONE_TO_ONE,
114
+ desc: str | None = None,
114
115
  verbose: bool = False,
115
116
  ):
116
117
  self.model = model
@@ -119,8 +120,9 @@ class Generator(Generic[ContextType, InputType]):
119
120
  self.prompt_strategy = prompt_strategy
120
121
  self.reasoning_effort = reasoning_effort
121
122
  self.api_base = api_base
123
+ self.desc = desc
122
124
  self.verbose = verbose
123
- self.prompt_factory = PromptFactory(prompt_strategy, model, cardinality)
125
+ self.prompt_factory = PromptFactory(prompt_strategy, model, cardinality, desc)
124
126
 
125
127
  def _parse_reasoning(self, completion_text: str, **kwargs) -> str:
126
128
  """Extract the reasoning for the generated output from the completion object."""
@@ -26,18 +26,21 @@ class ConvertOp(PhysicalOperator, ABC):
26
26
  self,
27
27
  cardinality: Cardinality = Cardinality.ONE_TO_ONE,
28
28
  udf: Callable | None = None,
29
+ desc: str | None = None,
29
30
  *args,
30
31
  **kwargs,
31
32
  ):
32
33
  super().__init__(*args, **kwargs)
33
34
  self.cardinality = cardinality
34
35
  self.udf = udf
36
+ self.desc = desc
35
37
 
36
38
  def get_id_params(self):
37
39
  id_params = super().get_id_params()
38
40
  id_params = {
39
41
  "cardinality": self.cardinality.value,
40
42
  "udf": self.udf,
43
+ "desc": self.desc,
41
44
  **id_params,
42
45
  }
43
46
 
@@ -45,7 +48,12 @@ class ConvertOp(PhysicalOperator, ABC):
45
48
 
46
49
  def get_op_params(self):
47
50
  op_params = super().get_op_params()
48
- op_params = {"cardinality": self.cardinality, "udf": self.udf, **op_params}
51
+ op_params = {
52
+ "cardinality": self.cardinality,
53
+ "udf": self.udf,
54
+ "desc": self.desc,
55
+ **op_params,
56
+ }
49
57
 
50
58
  return op_params
51
59
 
@@ -289,7 +297,7 @@ class LLMConvert(ConvertOp):
289
297
  self.prompt_strategy = prompt_strategy
290
298
  self.reasoning_effort = reasoning_effort
291
299
  if model is not None:
292
- self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, self.cardinality, self.verbose)
300
+ self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
293
301
 
294
302
  def __str__(self):
295
303
  op = super().__str__()
@@ -37,8 +37,8 @@ class CriticAndRefineConvert(LLMConvert):
37
37
  raise ValueError(f"Unsupported prompt strategy: {self.prompt_strategy}")
38
38
 
39
39
  # create generators
40
- self.critic_generator = Generator(self.critic_model, self.critic_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
41
- self.refine_generator = Generator(self.refine_model, self.refinement_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
40
+ self.critic_generator = Generator(self.critic_model, self.critic_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
41
+ self.refine_generator = Generator(self.refine_model, self.refinement_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
42
42
 
43
43
  def __str__(self):
44
44
  op = super().__str__()
@@ -22,10 +22,11 @@ from palimpzest.query.operators.physical import PhysicalOperator
22
22
 
23
23
 
24
24
  class FilterOp(PhysicalOperator, ABC):
25
- def __init__(self, filter: Filter, *args, **kwargs):
25
+ def __init__(self, filter: Filter, desc: str | None = None, *args, **kwargs):
26
26
  super().__init__(*args, **kwargs)
27
27
  assert self.input_schema == self.output_schema, "Input and output schemas must match for FilterOp"
28
28
  self.filter_obj = filter
29
+ self.desc = desc
29
30
 
30
31
  def __str__(self):
31
32
  op = super().__str__()
@@ -34,11 +35,11 @@ class FilterOp(PhysicalOperator, ABC):
34
35
 
35
36
  def get_id_params(self):
36
37
  id_params = super().get_id_params()
37
- return {"filter": str(self.filter_obj), **id_params}
38
+ return {"filter": str(self.filter_obj), "desc": self.desc, **id_params}
38
39
 
39
40
  def get_op_params(self):
40
41
  op_params = super().get_op_params()
41
- return {"filter": self.filter_obj, **op_params}
42
+ return {"filter": self.filter_obj, "desc": self.desc, **op_params}
42
43
 
43
44
  @abstractmethod
44
45
  def is_image_filter(self) -> bool:
@@ -182,7 +183,7 @@ class LLMFilter(FilterOp):
182
183
  self.model = model
183
184
  self.prompt_strategy = prompt_strategy
184
185
  self.reasoning_effort = reasoning_effort
185
- self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.verbose)
186
+ self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
186
187
 
187
188
  def get_id_params(self):
188
189
  id_params = super().get_id_params()
@@ -21,10 +21,11 @@ from palimpzest.query.operators.physical import PhysicalOperator
21
21
 
22
22
 
23
23
  class JoinOp(PhysicalOperator, ABC):
24
- def __init__(self, condition: str, *args, **kwargs):
24
+ def __init__(self, condition: str, desc: str | None = None, *args, **kwargs):
25
25
  super().__init__(*args, **kwargs)
26
26
  assert self.input_schema == self.output_schema, "Input and output schemas must match for JoinOp"
27
27
  self.condition = condition
28
+ self.desc = desc
28
29
 
29
30
  def __str__(self):
30
31
  op = super().__str__()
@@ -33,11 +34,11 @@ class JoinOp(PhysicalOperator, ABC):
33
34
 
34
35
  def get_id_params(self):
35
36
  id_params = super().get_id_params()
36
- return {"condition": self.condition, **id_params}
37
+ return {"condition": self.condition, "desc": self.desc, **id_params}
37
38
 
38
39
  def get_op_params(self):
39
40
  op_params = super().get_op_params()
40
- return {"condition": self.condition, **op_params}
41
+ return {"condition": self.condition, "desc": self.desc, **op_params}
41
42
 
42
43
  @abstractmethod
43
44
  def is_image_join(self) -> bool:
@@ -64,7 +65,7 @@ class BlockingNestedLoopsJoin(JoinOp):
64
65
  self.prompt_strategy = prompt_strategy
65
66
  self.join_parallelism = join_parallelism
66
67
  self.reasoning_effort = reasoning_effort
67
- self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.verbose)
68
+ self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
68
69
  self.join_idx = 0
69
70
 
70
71
  def get_id_params(self):
@@ -228,7 +229,7 @@ class NestedLoopsJoin(JoinOp):
228
229
  self.prompt_strategy = prompt_strategy
229
230
  self.join_parallelism = join_parallelism
230
231
  self.reasoning_effort = reasoning_effort
231
- self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.verbose)
232
+ self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base, Cardinality.ONE_TO_ONE, self.desc, self.verbose)
232
233
  self.join_idx = 0
233
234
 
234
235
  # maintain list(s) of input records for the join
@@ -256,12 +256,14 @@ class ConvertScan(LogicalOperator):
256
256
  self,
257
257
  cardinality: Cardinality = Cardinality.ONE_TO_ONE,
258
258
  udf: Callable | None = None,
259
+ desc: str | None = None,
259
260
  *args,
260
261
  **kwargs,
261
262
  ):
262
263
  super().__init__(*args, **kwargs)
263
264
  self.cardinality = cardinality
264
265
  self.udf = udf
266
+ self.desc = desc
265
267
 
266
268
  def __str__(self):
267
269
  return f"ConvertScan({self.input_schema} -> {str(self.output_schema)})"
@@ -271,6 +273,7 @@ class ConvertScan(LogicalOperator):
271
273
  logical_id_params = {
272
274
  "cardinality": self.cardinality,
273
275
  "udf": self.udf,
276
+ "desc": self.desc,
274
277
  **logical_id_params,
275
278
  }
276
279
 
@@ -281,6 +284,7 @@ class ConvertScan(LogicalOperator):
281
284
  logical_op_params = {
282
285
  "cardinality": self.cardinality,
283
286
  "udf": self.udf,
287
+ "desc": self.desc,
284
288
  **logical_op_params,
285
289
  }
286
290
 
@@ -327,11 +331,13 @@ class FilteredScan(LogicalOperator):
327
331
  def __init__(
328
332
  self,
329
333
  filter: Filter,
334
+ desc: str | None = None,
330
335
  *args,
331
336
  **kwargs,
332
337
  ):
333
338
  super().__init__(*args, **kwargs)
334
339
  self.filter = filter
340
+ self.desc = desc
335
341
 
336
342
  def __str__(self):
337
343
  return f"FilteredScan({str(self.output_schema)}, {str(self.filter)})"
@@ -340,6 +346,7 @@ class FilteredScan(LogicalOperator):
340
346
  logical_id_params = super().get_logical_id_params()
341
347
  logical_id_params = {
342
348
  "filter": self.filter,
349
+ "desc": self.desc,
343
350
  **logical_id_params,
344
351
  }
345
352
 
@@ -349,6 +356,7 @@ class FilteredScan(LogicalOperator):
349
356
  logical_op_params = super().get_logical_op_params()
350
357
  logical_op_params = {
351
358
  "filter": self.filter,
359
+ "desc": self.desc,
352
360
  **logical_op_params,
353
361
  }
354
362
 
@@ -390,16 +398,17 @@ class GroupByAggregate(LogicalOperator):
390
398
 
391
399
 
392
400
  class JoinOp(LogicalOperator):
393
- def __init__(self, condition: str, *args, **kwargs):
401
+ def __init__(self, condition: str, desc: str | None = None, *args, **kwargs):
394
402
  super().__init__(*args, **kwargs)
395
403
  self.condition = condition
404
+ self.desc = desc
396
405
 
397
406
  def __str__(self):
398
407
  return f"Join(condition={self.condition})"
399
408
 
400
409
  def get_logical_id_params(self) -> dict:
401
410
  logical_id_params = super().get_logical_id_params()
402
- logical_id_params = {"condition": self.condition, **logical_id_params}
411
+ logical_id_params = {"condition": self.condition, "desc": self.desc, **logical_id_params}
403
412
 
404
413
  return logical_id_params
405
414
 
@@ -407,6 +416,7 @@ class JoinOp(LogicalOperator):
407
416
  logical_op_params = super().get_logical_op_params()
408
417
  logical_op_params = {
409
418
  "condition": self.condition,
419
+ "desc": self.desc,
410
420
  **logical_op_params,
411
421
  }
412
422
 
@@ -36,10 +36,10 @@ class MixtureOfAgentsConvert(LLMConvert):
36
36
 
37
37
  # create generators
38
38
  self.proposer_generators = [
39
- Generator(model, self.proposer_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
39
+ Generator(model, self.proposer_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
40
40
  for model in proposer_models
41
41
  ]
42
- self.aggregator_generator = Generator(aggregator_model, self.aggregator_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
42
+ self.aggregator_generator = Generator(aggregator_model, self.aggregator_prompt_strategy, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
43
43
 
44
44
  def __str__(self):
45
45
  op = super().__str__()
@@ -22,8 +22,8 @@ class SplitConvert(LLMConvert):
22
22
  super().__init__(*args, **kwargs)
23
23
  self.num_chunks = num_chunks
24
24
  self.min_size_to_chunk = min_size_to_chunk
25
- self.split_generator = Generator(self.model, PromptStrategy.SPLIT_PROPOSER, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
26
- self.split_merge_generator = Generator(self.model, PromptStrategy.SPLIT_MERGER, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
25
+ self.split_generator = Generator(self.model, PromptStrategy.SPLIT_PROPOSER, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
26
+ self.split_merge_generator = Generator(self.model, PromptStrategy.SPLIT_MERGER, self.reasoning_effort, self.api_base, self.cardinality, self.desc, self.verbose)
27
27
 
28
28
  # crude adjustment factor for naive estimation in no-sentinel setting
29
29
  self.naive_quality_adjustment = 0.6
@@ -26,6 +26,7 @@ class QueryProcessorConfig(BaseModel):
26
26
  join_parallelism: int = Field(default=64)
27
27
  batch_size: int | None = Field(default=None)
28
28
  reasoning_effort: str | None = Field(default=None) # Gemini: "disable", "low", "medium", "high"
29
+ use_vertex: bool = Field(default=True) # Whether to use Vertex models for Gemini or Google models
29
30
  gemini_credentials_path: str | None = Field(default=None) # Path to Gemini credentials file
30
31
  api_base: str | None = Field(default=None) # API base URL for vLLM
31
32
 
@@ -39,8 +40,8 @@ class QueryProcessorConfig(BaseModel):
39
40
  use_final_op_quality: bool = Field(default=False)
40
41
 
41
42
  # sentinel optimization flags
42
- k: int = Field(default=5)
43
- j: int = Field(default=5)
43
+ k: int = Field(default=6)
44
+ j: int = Field(default=4)
44
45
  sample_budget: int = Field(default=100)
45
46
  seed: int = Field(default=42)
46
47
  exp_name: str | None = Field(default=None)
@@ -114,8 +114,8 @@ class QueryProcessor:
114
114
  execution_stats = ExecutionStats(execution_id=self.execution_id())
115
115
  execution_stats.start()
116
116
 
117
- # if the user provides a train_dataset or validator, we perform optimization
118
- if self.train_dataset is not None or self.validator is not None:
117
+ # if the user provides a validator, we perform optimization
118
+ if self.validator is not None:
119
119
  # create sentinel plan
120
120
  sentinel_plan = self._create_sentinel_plan(self.train_dataset)
121
121
 
@@ -62,13 +62,17 @@ class QueryProcessorFactory:
62
62
  print("WARNING: Both `progress` and `verbose` are set to True, but only one can be True at a time; defaulting to `progress=True`")
63
63
  config.verbose = False
64
64
 
65
+ # if the user provides a training dataset, but no validator, create a default validator
66
+ if train_dataset is not None and validator is None:
67
+ validator = Validator()
68
+ logger.info("No validator provided; using default Validator")
69
+
65
70
  # boolean flag for whether we're performing optimization or not
66
- optimization = train_dataset is not None or validator is not None
67
- val_based_opt = train_dataset is None and validator is not None
71
+ optimization = validator is not None
68
72
 
69
73
  # handle "auto" default for sentinel execution strategies
70
74
  if config.sentinel_execution_strategy == "auto":
71
- config.sentinel_execution_strategy = ("validator" if val_based_opt else "mab") if optimization else None
75
+ config.sentinel_execution_strategy = "mab" if optimization else None
72
76
 
73
77
  # convert the config values for processing, execution, and optimization strategies to enums
74
78
  config = cls._normalize_strategies(config)
@@ -76,7 +80,7 @@ class QueryProcessorFactory:
76
80
  # get available models
77
81
  available_models = getattr(config, 'available_models', [])
78
82
  if available_models is None or len(available_models) == 0:
79
- available_models = get_models(gemini_credentials_path=config.gemini_credentials_path, api_base=config.api_base)
83
+ available_models = get_models(use_vertex=config.use_vertex, gemini_credentials_path=config.gemini_credentials_path, api_base=config.api_base)
80
84
 
81
85
  # remove any models specified in the config
82
86
  remove_models = getattr(config, 'remove_models', [])
@@ -87,7 +91,7 @@ class QueryProcessorFactory:
87
91
  # set the final set of available models in the config
88
92
  config.available_models = available_models
89
93
 
90
- return config
94
+ return config, validator
91
95
 
92
96
  @classmethod
93
97
  def _create_optimizer(cls, config: QueryProcessorConfig) -> Optimizer:
@@ -143,7 +147,7 @@ class QueryProcessorFactory:
143
147
  config = QueryProcessorConfig()
144
148
 
145
149
  # apply any additional keyword arguments to the config and validate its contents
146
- config = cls._config_validation_and_normalization(config, train_dataset, validator)
150
+ config, validator = cls._config_validation_and_normalization(config, train_dataset, validator)
147
151
 
148
152
  # create the optimizer, execution strateg(ies), and processor
149
153
  optimizer = cls._create_optimizer(config)
@@ -3,7 +3,8 @@ import os
3
3
  from palimpzest.constants import Model
4
4
 
5
5
 
6
- def get_models(include_embedding: bool = False, gemini_credentials_path: str | None = None, api_base: str | None = None) -> list[Model]:
6
+ # TODO: better handle vertex vs. google for gemini models
7
+ def get_models(include_embedding: bool = False, use_vertex: bool = True, gemini_credentials_path: str | None = None, api_base: str | None = None) -> list[Model]:
7
8
  """
8
9
  Return the set of models which the system has access to based on the set environment variables.
9
10
  """
@@ -39,11 +40,15 @@ def get_models(include_embedding: bool = False, gemini_credentials_path: str | N
39
40
  )
40
41
  if os.getenv("GEMINI_API_KEY") is not None or os.path.exists(gemini_credentials_path):
41
42
  vertex_models = [model for model in Model if model.is_vertex_model()]
43
+ google_models = [model for model in Model if model.is_google_model()]
42
44
  if not include_embedding:
43
45
  vertex_models = [
44
46
  model for model in vertex_models if not model.is_embedding_model()
45
47
  ]
46
- models.extend(vertex_models)
48
+ if use_vertex:
49
+ models.extend(vertex_models)
50
+ else:
51
+ models.extend(google_models)
47
52
 
48
53
  if api_base is not None:
49
54
  vllm_models = [model for model in Model if model.is_vllm_model()]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -1,5 +1,5 @@
1
1
  palimpzest/__init__.py,sha256=1PzadDDOVMQJKNEYUH0_tw8tQKUYTT31M0vuzTr2Rqk,1694
2
- palimpzest/constants.py,sha256=1xGydUfkuVtaeoQ_Ku6P5PDLAelQKAVouivdXkva-zE,21109
2
+ palimpzest/constants.py,sha256=GagsbJl1xCAjgt6Biw27KnHSZgiramxhnerhmYe3P_k,24690
3
3
  palimpzest/policy.py,sha256=lIvw_C_rmwCH4LZaeNkAuixl8zw9RAW_JcSWSHPjKyc,11628
4
4
  palimpzest/agents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  palimpzest/agents/compute_agents.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,7 +9,7 @@ palimpzest/core/models.py,sha256=fLO4T7x0njNeEbUpbhJm9cdnBva0y0Zw5WGBGdzdS_I,424
9
9
  palimpzest/core/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  palimpzest/core/data/context.py,sha256=x1xYyu9qW65dvtK_XayIfv_CgsCEPW6Qe0DTiSf9sjU,16207
11
11
  palimpzest/core/data/context_manager.py,sha256=8hAKWD2jhFZgghTu7AYgjkvKDsJUPVxq8g4nG0HWvfo,6150
12
- palimpzest/core/data/dataset.py,sha256=vqEEMxaG157jdyzUxM_tLt5Xq_49Yq-0dVGhS0ZUiHA,27904
12
+ palimpzest/core/data/dataset.py,sha256=M7SxPXzHsfj-ljy_P3ckcJNqGf4RwNxtZI02q_tmL2M,28178
13
13
  palimpzest/core/data/index_dataset.py,sha256=adO67DgzHhA4lBME0-h4SjXfdz9UcNMSDGXTpUdKbgE,1929
14
14
  palimpzest/core/data/iter_dataset.py,sha256=u7eZNWWT84rH_D8LNIuq0NAnm2roX81ifKTYp-hwY7g,20512
15
15
  palimpzest/core/elements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,45 +21,45 @@ palimpzest/core/lib/schemas.py,sha256=0qauaG3uW5tCJXNAo1i0G0UgbTaQLSLT6GoNDX8494
21
21
  palimpzest/prompts/__init__.py,sha256=sdZbC8RWi_IGjFuzKQMdRjS2Ih4zQnkyzFoJ6Q3Ce70,1764
22
22
  palimpzest/prompts/agent_prompts.py,sha256=CUzBVLBiPSw8OShtKp4VTpQwtrNMtcMglo-IZHMvuDM,17459
23
23
  palimpzest/prompts/context_search.py,sha256=s3pti4XNRiIyiWzjVNL_NqmqEc31jzSKMF2SlN0Aaf8,357
24
- palimpzest/prompts/convert_prompts.py,sha256=FR_zUADuOWxMqZED4S0lyO9VNgKPNiVpSZv6ND7a0v4,6009
24
+ palimpzest/prompts/convert_prompts.py,sha256=quoIcdIrP4FoPBXlHKeOPjH5lVn8GH30f1tPiu3Xsyw,6037
25
25
  palimpzest/prompts/critique_and_refine_convert_prompts.py,sha256=WoXExBxQ7twswd9VCCST26c-2ehZtpD2iQoBi7sqDnQ,7814
26
- palimpzest/prompts/filter_prompts.py,sha256=lYQFrpAKhOMUQDOVbRBHh7IjuUNMCmBnAqHwDuptQHI,4232
27
- palimpzest/prompts/join_prompts.py,sha256=viQVvOpa2l9PYM34ua_jPNZnUOU_eCTMIoabBkF5cVc,5929
26
+ palimpzest/prompts/filter_prompts.py,sha256=drTivlA_WnWkAIzY9GjqO_hfwdy432nMtsV-OYa-mlE,4260
27
+ palimpzest/prompts/join_prompts.py,sha256=fEGZY_zn_dvOJCeUFYrHdg5P3h_H6Fo3FMmdnXwp2l4,5957
28
28
  palimpzest/prompts/moa_aggregator_convert_prompts.py,sha256=BQRrtGdr53PTqvXzmFh8kfQ_w9KoKw-zTtmdo-8RFjo,2887
29
- palimpzest/prompts/moa_proposer_convert_prompts.py,sha256=35pxtR2hnjLkv_10VEetRR9qUCR-zD85NZF3BaAANDk,3462
30
- palimpzest/prompts/prompt_factory.py,sha256=FDBoVdJ_khT7t6T6WAiK6RgC7HqB3efmRkwMam3AIhM,51262
29
+ palimpzest/prompts/moa_proposer_convert_prompts.py,sha256=8vhq0bnikbCzS4CDV5IskFPWF0TC7VZGjeGvpOyIBV8,3476
30
+ palimpzest/prompts/prompt_factory.py,sha256=MpEYoyPXY3gfFGG60O9rlw5A5UejC3CTRHcO6KRhyww,51733
31
31
  palimpzest/prompts/split_merge_prompts.py,sha256=0mTZeJhxtvlmv-ro0KwQpxlGgSTwyUhGRHJ-uHk2Zlw,3146
32
- palimpzest/prompts/split_proposer_prompts.py,sha256=TBHLGaM_ycHjGHrp1JziJoJDw4S5_F4afKSAdt2McKk,2624
33
- palimpzest/prompts/util_phrases.py,sha256=NWrcHfjJyiOY16Jyt7R50moVnlJDyvSBZ9kBqyX2WQo,751
32
+ palimpzest/prompts/split_proposer_prompts.py,sha256=X3hufHPAiQyytZ_TFe2wJkVPgJtClZ9fVgz2zNk2Z5Q,2638
33
+ palimpzest/prompts/util_phrases.py,sha256=ajxzj-B2gE56IENKVKElqw1xKWOF5IahOOqq026Pr00,876
34
34
  palimpzest/prompts/validator.py,sha256=pJTZjlt_OiFM3IFOgsJ0jQdayra8iRVrpqENlXI9tQQ,10532
35
35
  palimpzest/query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  palimpzest/query/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
37
  palimpzest/query/execution/all_sample_execution_strategy.py,sha256=3n2hl8m-WFWIu-a8DiSVsGkz4ej3yB7mSdFR0jsiwAU,14366
38
38
  palimpzest/query/execution/execution_strategy.py,sha256=KwBJbWOBOOPBiWRm3ypHcAQiWbCsvtW6UnVU4tHkYz8,18905
39
39
  palimpzest/query/execution/execution_strategy_type.py,sha256=vRQBPCQN5_aoyD3TLIeW3VPo15mqF-5RBvEXkENz9FE,987
40
- palimpzest/query/execution/mab_execution_strategy.py,sha256=LY1JlbYMsnJHCtYjaJ6iklojBqXc2B4KS62lobPFNz0,42341
40
+ palimpzest/query/execution/mab_execution_strategy.py,sha256=paVfB8lqNyUuISqfhkTd6RqOZqpyVty1EAN1sZz7erA,43554
41
41
  palimpzest/query/execution/parallel_execution_strategy.py,sha256=Gn5hB5XddX2jCkxx6d7O-DmitK6fbuwBFnnyKhnGYEw,15706
42
42
  palimpzest/query/execution/single_threaded_execution_strategy.py,sha256=1eo-Z9G3u92_PjoSX8HmO3D3phYgA8f0Actbgd1-oKY,16247
43
43
  palimpzest/query/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
- palimpzest/query/generators/generators.py,sha256=pi6gTCzQYs-z93IFNGKyoskIcdYCSnOwyaj-DvSlkb4,20877
44
+ palimpzest/query/generators/generators.py,sha256=WYMcff7axgDEmYqXvy1A-C5FA4s6lI393CcCx-TKTgM,20941
45
45
  palimpzest/query/operators/__init__.py,sha256=j-yh0P5tzXGa0JU_g8aNn54wCJDXPCMbmtOmazXXEts,3459
46
46
  palimpzest/query/operators/aggregate.py,sha256=QvWr4C1arFSZWVqPSF5F5On6Ise5OF3VVWNGSq6Gfjk,11230
47
47
  palimpzest/query/operators/compute.py,sha256=bxMKLRU_o7v603daKeR0FayDZ_V6NLI1fGzgu6E-sac,8473
48
- palimpzest/query/operators/convert.py,sha256=teesuAeYl20ULwm6LIA277SZremdHedD2N2GYDUjb5E,17156
49
- palimpzest/query/operators/critique_and_refine_convert.py,sha256=nJOQf7RLJR5Acg7fPssb0tTmtsCipG8hHu9PRquM9RE,5271
48
+ palimpzest/query/operators/convert.py,sha256=mpXXYdgcH6zPqypzh3SjTqHjq7PDEbkfdZ1_XEml-nw,17334
49
+ palimpzest/query/operators/critique_and_refine_convert.py,sha256=PbtKva6e3fh3yeUMGlkcpacPD003bFBzgBsw_yy-8fw,5293
50
50
  palimpzest/query/operators/distinct.py,sha256=MuF3NlC0QMTSGs0_fe2oly0I5Ow0hfOa7h8BFGhHiCs,2594
51
- palimpzest/query/operators/filter.py,sha256=Wm1PaxURE1ZY5j7E1AitGdJfb_IKJoC_3qQW8aF0XC4,10703
52
- palimpzest/query/operators/join.py,sha256=z1bzhdazTEq1BjoUSwV6j_DQ84TJ3uaSZJpCzSP61nc,17727
51
+ palimpzest/query/operators/filter.py,sha256=jmSGV7xZ8uxXzH-Oko7l8ZPZxNf_qJNkYVAYgiSHl9g,10802
52
+ palimpzest/query/operators/join.py,sha256=rs9_Y59082dlnSJu9rpRDEuv7jDPItKSpYsC8FCMFDM,17837
53
53
  palimpzest/query/operators/limit.py,sha256=upJ775cGkxjFHRJm8GpSvtJN1cspg2FVYLN_MrIfUo4,2113
54
- palimpzest/query/operators/logical.py,sha256=rh3XBUVO1JAEijw9AHjU35uf5ag01-KONdpCHJXRs3M,19883
55
- palimpzest/query/operators/mixture_of_agents_convert.py,sha256=Y6O9-zL_6BPwl5Yix3SyYhI_68wiejOtJ3xuFcn_dbs,6731
54
+ palimpzest/query/operators/logical.py,sha256=K_dRlNKkda35kQ7gYGsrW9PoFuDPzexpjtDq_FYdhVw,20223
55
+ palimpzest/query/operators/mixture_of_agents_convert.py,sha256=4v2V612NqdVD0RmcJ5VSgTiVliObku-t-A79SXVnpk0,6753
56
56
  palimpzest/query/operators/physical.py,sha256=buPZjtP4HKNVfOCNWdBtDnRS217dSsIG74gqZ1jmoyo,8320
57
57
  palimpzest/query/operators/project.py,sha256=RX5SbHFRwHcMfiQRofIPQr-AHgIDYm68ifiFZAPu7Fo,2094
58
58
  palimpzest/query/operators/rag_convert.py,sha256=1QQGrE22-Ec3-MNbnaU3k4TGHdpi2qZqZR9MHUniEM4,10691
59
59
  palimpzest/query/operators/retrieve.py,sha256=v1FTFsSctqH4B37aWgBXYIxgOMJwRWQ2kwwXu1huwaQ,13106
60
60
  palimpzest/query/operators/scan.py,sha256=Da_EZUrArzlAameHYCmtqo-xbPOFvbTYSktrUcUEUSc,7398
61
61
  palimpzest/query/operators/search.py,sha256=xydO5Kni0RArpvLSn2ajzD4TcH442VjpP2x9NakjzaA,22842
62
- palimpzest/query/operators/split_convert.py,sha256=SgtkwGWnIFlQTk96NsgckRx5q15KaGpsF3Si0FzHEGo,7765
62
+ palimpzest/query/operators/split_convert.py,sha256=acCPlkrUfqHhGD7bU2AXQAhIEeAEIh0itamuCOm4KBk,7787
63
63
  palimpzest/query/optimizer/__init__.py,sha256=L2E1rOA-8O9oH6JL56wLI1qUVxXBLubJEG1IHMH-HU4,2384
64
64
  palimpzest/query/optimizer/cost_model.py,sha256=OldPy-TJdfsQbYRoKlb3yWeKbi15jcldTIUS6BTi9T8,12678
65
65
  palimpzest/query/optimizer/optimizer.py,sha256=mgM6c0d_voGNun2hMzqjfumJVieACtcHsNnBP4LyXAA,19626
@@ -70,9 +70,9 @@ palimpzest/query/optimizer/primitives.py,sha256=jMMVq37y1tWiPU1lSSKQP9OP-mzkpSxS
70
70
  palimpzest/query/optimizer/rules.py,sha256=9AsuVjhiZUc0snQPNhIqeyKpmqFsSv7e-v6BEbp9CDw,43315
71
71
  palimpzest/query/optimizer/tasks.py,sha256=DJcKDNbVJox61rnTW0HgT1PtxGx2P_NiLvNroXie-Lg,29509
72
72
  palimpzest/query/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- palimpzest/query/processor/config.py,sha256=b_EQOqOXoRP6AziOw1iLqb8tlSWP-D1_el3mmrnBDAk,2263
74
- palimpzest/query/processor/query_processor.py,sha256=W01-2FocN1Jsv58gmEo5ALTIcpLt7D0dmI8kghSCdBk,6291
75
- palimpzest/query/processor/query_processor_factory.py,sha256=H_2pkcN_aVbNDuMLsvZP2PXARLF9MwoHGAzEWkSNNYM,7866
73
+ palimpzest/query/processor/config.py,sha256=vHVsgeBnKigacO0QA7bLf5q8pJhFWA2j9-p_no2bmYo,2366
74
+ palimpzest/query/processor/query_processor.py,sha256=T4ffPbnOX23G8FDITzmM7Iw7DUEDWIHnwl8XLYllgjg,6240
75
+ palimpzest/query/processor/query_processor_factory.py,sha256=6w9R1Y8AOV22X8MUf7g2G5Qb15BGEZAXQKbCQJafWJ0,8048
76
76
  palimpzest/schemabuilder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
77
  palimpzest/schemabuilder/schema_builder.py,sha256=QraGp66dcD-ej6Y2mER40o86G9JqlBkL7swkJzjUAIY,7968
78
78
  palimpzest/tools/README.md,sha256=56_6LPG80uc0CLVhTBP6I1wgIffNv9cyTr0TmVZqmrM,483
@@ -83,13 +83,13 @@ palimpzest/tools/skema_tools.py,sha256=HXUFpjMhbVxZwKKkATeK-FwtlTCawaCbeP-uHntI1
83
83
  palimpzest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
84
  palimpzest/utils/env_helpers.py,sha256=n81KzoJ459pRxo7QmJA7duazwWsfoMGTHc71D2LatFk,334
85
85
  palimpzest/utils/hash_helpers.py,sha256=3A8dA7SbXTwnnvZvPVNqqMLlVRhCKyKF_bjNNAu3Exk,334
86
- palimpzest/utils/model_helpers.py,sha256=Vlu3KIvbc4Usg4iSI2KMFSc-qcdAubWN2CSjZod2czY,2233
86
+ palimpzest/utils/model_helpers.py,sha256=X6SlMgD5I5Aj_cxaFaoGaaNvOOqTNZVmjj6zbfn63Yk,2476
87
87
  palimpzest/utils/progress.py,sha256=7gucyZr82udMDZitrrkAOSKHZVljE3R2wv9nf5gA5TM,20807
88
88
  palimpzest/utils/udfs.py,sha256=LjHic54B1az-rKgNLur0wOpaz2ko_UodjLEJrazkxvY,1854
89
89
  palimpzest/validator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
90
  palimpzest/validator/validator.py,sha256=J2tGvJqfg6v5lOQDYYaqAa9d37uVHBrqkNs-a8d1Ic0,16365
91
- palimpzest-0.8.0.dist-info/licenses/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
92
- palimpzest-0.8.0.dist-info/METADATA,sha256=MUkUorsKFMVGPmCeAZOBruvKP8shJ1kbF5kulxPnSHc,7286
93
- palimpzest-0.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
94
- palimpzest-0.8.0.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
95
- palimpzest-0.8.0.dist-info/RECORD,,
91
+ palimpzest-0.8.2.dist-info/licenses/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
92
+ palimpzest-0.8.2.dist-info/METADATA,sha256=bDa2zFfJr_v4Ef6fzq3SCALSoXoXc0uPnefnmVbAzTA,7286
93
+ palimpzest-0.8.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
94
+ palimpzest-0.8.2.dist-info/top_level.txt,sha256=raV06dJUgohefUn3ZyJS2uqp_Y76EOLA9Y2e_fxt8Ew,11
95
+ palimpzest-0.8.2.dist-info/RECORD,,