palimpzest 0.8.2__tar.gz → 0.8.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. {palimpzest-0.8.2/src/palimpzest.egg-info → palimpzest-0.8.4}/PKG-INFO +3 -8
  2. {palimpzest-0.8.2 → palimpzest-0.8.4}/pyproject.toml +3 -9
  3. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/constants.py +38 -62
  4. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/data/iter_dataset.py +5 -5
  5. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/elements/groupbysig.py +1 -1
  6. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/elements/records.py +91 -109
  7. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/lib/schemas.py +23 -0
  8. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/models.py +3 -3
  9. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/prompts/__init__.py +2 -6
  10. palimpzest-0.8.4/src/palimpzest/prompts/convert_prompts.py +87 -0
  11. palimpzest-0.8.4/src/palimpzest/prompts/critique_and_refine_prompts.py +66 -0
  12. palimpzest-0.8.4/src/palimpzest/prompts/filter_prompts.py +76 -0
  13. palimpzest-0.8.4/src/palimpzest/prompts/join_prompts.py +100 -0
  14. palimpzest-0.8.2/src/palimpzest/prompts/moa_aggregator_convert_prompts.py → palimpzest-0.8.4/src/palimpzest/prompts/moa_aggregator_prompts.py +51 -2
  15. palimpzest-0.8.4/src/palimpzest/prompts/moa_proposer_prompts.py +87 -0
  16. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/prompts/prompt_factory.py +351 -479
  17. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/prompts/split_merge_prompts.py +51 -2
  18. palimpzest-0.8.4/src/palimpzest/prompts/split_proposer_prompts.py +87 -0
  19. palimpzest-0.8.4/src/palimpzest/prompts/utils.py +109 -0
  20. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  21. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/execution_strategy.py +4 -4
  22. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/mab_execution_strategy.py +1 -2
  23. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/parallel_execution_strategy.py +3 -3
  24. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
  25. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/generators/generators.py +31 -17
  26. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/__init__.py +15 -2
  27. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/aggregate.py +21 -19
  28. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/compute.py +6 -8
  29. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/convert.py +12 -37
  30. palimpzest-0.8.4/src/palimpzest/query/operators/critique_and_refine.py +194 -0
  31. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/distinct.py +7 -7
  32. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/filter.py +13 -25
  33. palimpzest-0.8.4/src/palimpzest/query/operators/join.py +532 -0
  34. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/limit.py +4 -4
  35. palimpzest-0.8.4/src/palimpzest/query/operators/mixture_of_agents.py +246 -0
  36. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/physical.py +25 -2
  37. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/project.py +4 -4
  38. palimpzest-0.8.2/src/palimpzest/query/operators/rag_convert.py → palimpzest-0.8.4/src/palimpzest/query/operators/rag.py +202 -5
  39. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/retrieve.py +10 -9
  40. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/scan.py +9 -10
  41. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/search.py +18 -24
  42. palimpzest-0.8.4/src/palimpzest/query/operators/split.py +321 -0
  43. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/__init__.py +12 -8
  44. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/optimizer.py +12 -10
  45. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/rules.py +201 -108
  46. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/tasks.py +18 -6
  47. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/validator/validator.py +7 -9
  48. {palimpzest-0.8.2 → palimpzest-0.8.4/src/palimpzest.egg-info}/PKG-INFO +3 -8
  49. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest.egg-info/SOURCES.txt +8 -8
  50. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest.egg-info/requires.txt +1 -7
  51. palimpzest-0.8.2/src/palimpzest/prompts/convert_prompts.py +0 -143
  52. palimpzest-0.8.2/src/palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
  53. palimpzest-0.8.2/src/palimpzest/prompts/filter_prompts.py +0 -114
  54. palimpzest-0.8.2/src/palimpzest/prompts/join_prompts.py +0 -163
  55. palimpzest-0.8.2/src/palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
  56. palimpzest-0.8.2/src/palimpzest/prompts/split_proposer_prompts.py +0 -55
  57. palimpzest-0.8.2/src/palimpzest/prompts/util_phrases.py +0 -19
  58. palimpzest-0.8.2/src/palimpzest/query/operators/critique_and_refine_convert.py +0 -113
  59. palimpzest-0.8.2/src/palimpzest/query/operators/join.py +0 -403
  60. palimpzest-0.8.2/src/palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
  61. palimpzest-0.8.2/src/palimpzest/query/operators/split_convert.py +0 -170
  62. {palimpzest-0.8.2 → palimpzest-0.8.4}/LICENSE +0 -0
  63. {palimpzest-0.8.2 → palimpzest-0.8.4}/README.md +0 -0
  64. {palimpzest-0.8.2 → palimpzest-0.8.4}/setup.cfg +0 -0
  65. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/__init__.py +0 -0
  66. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/agents/__init__.py +0 -0
  67. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/agents/compute_agents.py +0 -0
  68. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/agents/search_agents.py +0 -0
  69. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/__init__.py +0 -0
  70. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/data/__init__.py +0 -0
  71. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/data/context.py +0 -0
  72. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/data/context_manager.py +0 -0
  73. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/data/dataset.py +0 -0
  74. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/data/index_dataset.py +0 -0
  75. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/elements/__init__.py +0 -0
  76. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/elements/filters.py +0 -0
  77. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/core/lib/__init__.py +0 -0
  78. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/policy.py +0 -0
  79. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/prompts/agent_prompts.py +0 -0
  80. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/prompts/context_search.py +0 -0
  81. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/prompts/validator.py +0 -0
  82. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/__init__.py +0 -0
  83. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/__init__.py +0 -0
  84. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
  85. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/generators/__init__.py +0 -0
  86. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/operators/logical.py +0 -0
  87. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/cost_model.py +0 -0
  88. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
  89. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
  90. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/plan.py +0 -0
  91. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/optimizer/primitives.py +0 -0
  92. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/processor/__init__.py +0 -0
  93. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/processor/config.py +0 -0
  94. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/processor/query_processor.py +0 -0
  95. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/query/processor/query_processor_factory.py +0 -0
  96. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/schemabuilder/__init__.py +0 -0
  97. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
  98. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/tools/README.md +0 -0
  99. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/tools/__init__.py +0 -0
  100. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/tools/allenpdf.py +0 -0
  101. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/tools/pdfparser.py +0 -0
  102. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/tools/skema_tools.py +0 -0
  103. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/utils/__init__.py +0 -0
  104. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/utils/env_helpers.py +0 -0
  105. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/utils/hash_helpers.py +0 -0
  106. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/utils/model_helpers.py +0 -0
  107. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/utils/progress.py +0 -0
  108. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/utils/udfs.py +0 -0
  109. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest/validator/__init__.py +0 -0
  110. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest.egg-info/dependency_links.txt +0 -0
  111. {palimpzest-0.8.2 → palimpzest-0.8.4}/src/palimpzest.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: palimpzest
3
- Version: 0.8.2
3
+ Version: 0.8.4
4
4
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
5
5
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
6
6
  Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
13
13
  Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Programming Language :: Python :: 3.8
15
- Requires-Python: >=3.8
15
+ Requires-Python: >=3.10
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: anthropic>=0.55.0
@@ -22,7 +22,7 @@ Requires-Dist: colorama>=0.4.6
22
22
  Requires-Dist: datasets>=4.0.0
23
23
  Requires-Dist: fastapi~=0.115.0
24
24
  Requires-Dist: gradio>=5.26.0
25
- Requires-Dist: litellm>=1.73.1
25
+ Requires-Dist: litellm>=1.76.1
26
26
  Requires-Dist: numpy==2.0.2
27
27
  Requires-Dist: openai>=1.0
28
28
  Requires-Dist: pandas>=2.1.1
@@ -44,11 +44,6 @@ Requires-Dist: tabulate>=0.9.0
44
44
  Requires-Dist: together>=1.5.5
45
45
  Requires-Dist: tqdm~=4.66.1
46
46
  Requires-Dist: rich[jupyter]>=13.9.2
47
- Provides-Extra: docs
48
- Requires-Dist: mkdocs>=1.6.1; extra == "docs"
49
- Requires-Dist: mkdocs-material>=9.6.3; extra == "docs"
50
- Requires-Dist: mkdocstrings-python>=1.15.0; extra == "docs"
51
- Requires-Dist: mkdocs-material[imaging]; extra == "docs"
52
47
  Provides-Extra: vllm
53
48
  Requires-Dist: vllm>=0.10.1.1; extra == "vllm"
54
49
  Dynamic: license-file
@@ -1,9 +1,9 @@
1
1
  [project]
2
2
  name = "palimpzest"
3
- version = "0.8.2"
3
+ version = "0.8.4"
4
4
  description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
5
5
  readme = "README.md"
6
- requires-python = ">=3.8"
6
+ requires-python = ">=3.10"
7
7
  keywords = ["relational", "optimization", "llm", "AI programming", "extraction", "tools", "document", "search", "integration"]
8
8
  authors = [
9
9
  {name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
@@ -16,7 +16,7 @@ dependencies = [
16
16
  "datasets>=4.0.0",
17
17
  "fastapi~=0.115.0",
18
18
  "gradio>=5.26.0",
19
- "litellm>=1.73.1",
19
+ "litellm>=1.76.1",
20
20
  "numpy==2.0.2",
21
21
  "openai>=1.0",
22
22
  "pandas>=2.1.1",
@@ -49,12 +49,6 @@ classifiers=[
49
49
  ]
50
50
 
51
51
  [project.optional-dependencies]
52
- docs = [
53
- "mkdocs>=1.6.1",
54
- "mkdocs-material>=9.6.3",
55
- "mkdocstrings-python>=1.15.0",
56
- "mkdocs-material[imaging]",
57
- ]
58
52
  vllm = [
59
53
  "vllm>=0.10.1.1",
60
54
  ]
@@ -25,8 +25,6 @@ class Model(str, Enum):
25
25
  GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
26
26
  GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
27
27
  o4_MINI = "openai/o4-mini-2025-04-16" # noqa: N815
28
- TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
29
- CLIP_VIT_B_32 = "clip-ViT-B-32"
30
28
  CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
31
29
  CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
32
30
  CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
@@ -41,6 +39,8 @@ class Model(str, Enum):
41
39
  GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
42
40
  VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
43
41
  # o1 = "o1-2024-12-17"
42
+ TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
43
+ CLIP_VIT_B_32 = "clip-ViT-B-32"
44
44
 
45
45
  def __repr__(self):
46
46
  return f"{self.name}"
@@ -136,69 +136,38 @@ class PromptStrategy(str, Enum):
136
136
  performing some task with a specified Model.
137
137
  """
138
138
 
139
- # Chain-of-Thought Boolean Prompt Strategies
140
- COT_BOOL = "chain-of-thought-bool"
141
- COT_BOOL_NO_REASONING = "chain-of-thought-bool-no-reasoning"
142
- # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
143
- # COT_BOOL_REFINE = "chain-of-thought-bool-refine"
144
-
145
- # Chain-of-Thought Boolean with Image Prompt Strategies
146
- COT_BOOL_IMAGE = "chain-of-thought-bool-image"
147
- COT_BOOL_IMAGE_NO_REASONING = "chain-of-thought-bool-image"
148
- COT_BOOL_AUDIO = "chain-of-thought-bool-audio"
149
- COT_BOOL_AUDIO_NO_REASONING = "chain-of-thought-bool-audio"
150
- # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
151
- # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"
152
-
153
- # Chain-of-Thought Join Prompt Strategies
154
- COT_JOIN = "chain-of-thought-join"
155
- COT_JOIN_NO_REASONING = "chain-of-thought-join-no-reasoning"
156
- COT_JOIN_IMAGE = "chain-of-thought-join-image"
157
- COT_JOIN_IMAGE_NO_REASONING = "chain-of-thought-join-image-no-reasoning"
158
- COT_JOIN_AUDIO = "chain-of-thought-join-audio"
159
- COT_JOIN_AUDIO_NO_REASONING = "chain-of-thought-join-audio-no-reasoning"
160
-
161
- # Chain-of-Thought Question Answering Prompt Strategies
162
- COT_QA = "chain-of-thought-question"
163
- COT_QA_NO_REASONING = "chain-of-thought-question-no-reasoning"
164
- COT_QA_CRITIC = "chain-of-thought-question-critic"
165
- COT_QA_REFINE = "chain-of-thought-question-refine"
166
-
167
- # Chain-of-Thought Question with Image Prompt Strategies
168
- COT_QA_IMAGE = "chain-of-thought-question-image"
169
- COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
170
- COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
171
- COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
172
-
173
- # Chain-of-Thought Queestion with Audio Prompt Strategies
174
- COT_QA_AUDIO = "chain-of-thought-question-audio"
175
- COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
176
- # TODO: COT_QA_AUDIO_CRITIC/REFINE
177
-
178
- # Mixture-of-Agents Prompt Strategies
179
- COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
180
- COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
181
- COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
182
- # TODO: COT_MOA_PROPOSER_AUDIO
183
-
184
- # Split Convert Prompt Strategies
185
- SPLIT_PROPOSER = "split-proposer"
186
- SPLIT_MERGER = "split-merger"
187
-
188
- def is_image_prompt(self):
189
- return "image" in self.value
190
-
191
- def is_audio_prompt(self):
192
- return "audio" in self.value
193
-
194
- def is_bool_prompt(self):
195
- return "bool" in self.value
139
+ # filter prompt strategies
140
+ FILTER = "filter"
141
+ FILTER_NO_REASONING = "filter-no-reasoning"
142
+ FILTER_CRITIC = "filter-critic"
143
+ FILTER_REFINE = "filter-refine"
144
+ FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
145
+ FILTER_MOA_AGG = "filter-mixture-of-agents-aggregation"
146
+ FILTER_SPLIT_PROPOSER = "filter-split-proposer"
147
+ FILTER_SPLIT_MERGER = "filter-split-merger"
148
+
149
+ # join prompt strategies
150
+ JOIN = "join"
151
+ JOIN_NO_REASONING = "join-no-reasoning"
152
+
153
+ # map prompt strategies
154
+ MAP = "map"
155
+ MAP_NO_REASONING = "map-no-reasoning"
156
+ MAP_CRITIC = "map-critic"
157
+ MAP_REFINE = "map-refine"
158
+ MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
159
+ MAP_MOA_AGG = "map-mixture-of-agents-aggregation"
160
+ MAP_SPLIT_PROPOSER = "map-split-proposer"
161
+ MAP_SPLIT_MERGER = "map-split-merger"
162
+
163
+ def is_filter_prompt(self):
164
+ return "filter" in self.value
196
165
 
197
166
  def is_join_prompt(self):
198
167
  return "join" in self.value
199
168
 
200
- def is_convert_prompt(self):
201
- return "bool" not in self.value and "join" not in self.value
169
+ def is_map_prompt(self):
170
+ return "map" in self.value
202
171
 
203
172
  def is_critic_prompt(self):
204
173
  return "critic" in self.value
@@ -221,6 +190,13 @@ class PromptStrategy(str, Enum):
221
190
  def is_no_reasoning_prompt(self):
222
191
  return "no-reasoning" in self.value
223
192
 
193
+
194
+ class Modality(str, Enum):
195
+ TEXT = "text"
196
+ IMAGE = "image"
197
+ AUDIO = "audio"
198
+
199
+
224
200
  class AggFunc(str, Enum):
225
201
  COUNT = "count"
226
202
  AVERAGE = "average"
@@ -527,7 +503,7 @@ CLIP_VIT_B_32_MODEL_CARD = {
527
503
  ##### Time #####
528
504
  "seconds_per_output_token": 0.0098, # NOTE: just copying TEXT_EMBEDDING_3_SMALL_MODEL_CARD for now
529
505
  ##### Agg. Benchmark #####
530
- "overall": 63.3, # NOTE: ImageNet top-1 accuracy
506
+ "overall": 63.3, # NOTE: imageNet top-1 accuracy
531
507
  }
532
508
  CLAUDE_3_5_SONNET_MODEL_CARD = {
533
509
  ##### Cost in USD #####
@@ -227,7 +227,7 @@ class HTMLFileDataset(BaseFileDataset):
227
227
  path (str): The path to the directory
228
228
  """
229
229
  super().__init__(path=path, id=id, schema=WebPage)
230
- assert all([filename.endswith(tuple(constants.HTML_EXTENSIONS)) for filename in self.filepaths])
230
+ self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.HTML_EXTENSIONS))]
231
231
 
232
232
  def _html_to_text_with_links(self, html: str) -> str:
233
233
  # Parse the HTML content
@@ -295,7 +295,7 @@ class ImageFileDataset(BaseFileDataset):
295
295
  path (str): The path to the directory
296
296
  """
297
297
  super().__init__(path=path, id=id, schema=ImageFile)
298
- assert all([filename.endswith(tuple(constants.IMAGE_EXTENSIONS)) for filename in self.filepaths])
298
+ self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.IMAGE_EXTENSIONS))]
299
299
 
300
300
  def __getitem__(self, idx: int) -> dict:
301
301
  """
@@ -347,7 +347,7 @@ class PDFFileDataset(BaseFileDataset):
347
347
  file_cache_dir (str): The directory to store the temporary files generated during PDF processing
348
348
  """
349
349
  super().__init__(path=path, id=id, schema=PDFFile)
350
- assert all([filename.endswith(tuple(constants.PDF_EXTENSIONS)) for filename in self.filepaths])
350
+ self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.PDF_EXTENSIONS))]
351
351
  self.pdfprocessor = pdfprocessor
352
352
  self.file_cache_dir = file_cache_dir
353
353
 
@@ -432,7 +432,7 @@ class XLSFileDataset(BaseFileDataset):
432
432
  Constructor for the `XLSFileDataset` class. The `schema` is set to the `XLSFile` schema.
433
433
  """
434
434
  super().__init__(path=path, id=id, schema=XLSFile)
435
- assert all([filename.endswith(tuple(constants.XLS_EXTENSIONS)) for filename in self.filepaths])
435
+ self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.XLS_EXTENSIONS))]
436
436
 
437
437
  def __getitem__(self, idx: int) -> dict:
438
438
  """
@@ -483,7 +483,7 @@ class AudioFileDataset(BaseFileDirectoryDataset):
483
483
  path (str): The path to the directory
484
484
  """
485
485
  super().__init__(path=path, id=id, schema=AudioFile)
486
- assert all([filename.endswith(tuple(constants.AUDIO_EXTENSIONS)) for filename in self.filepaths])
486
+ self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.AUDIO_EXTENSIONS))]
487
487
 
488
488
  def __getitem__(self, idx: int) -> dict:
489
489
  """
@@ -16,7 +16,7 @@ class GroupBySig:
16
16
  self.agg_funcs = agg_funcs
17
17
  self.agg_fields = agg_fields
18
18
 
19
- def validate_schema(self, input_schema: BaseModel) -> tuple[bool, str | None]:
19
+ def validate_schema(self, input_schema: type[BaseModel]) -> tuple[bool, str | None]:
20
20
  for f in self.group_by_fields:
21
21
  if f not in input_schema.model_fields:
22
22
  return (False, "Supplied schema has no field " + f)
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  from collections.abc import Generator
5
+ from copy import deepcopy
5
6
  from typing import Any
6
7
 
7
8
  import pandas as pd
@@ -28,8 +29,8 @@ class DataRecord:
28
29
 
29
30
  def __init__(
30
31
  self,
31
- schema: BaseModel,
32
- source_indices: str | list[str],
32
+ data_item: BaseModel,
33
+ source_indices: str | int | list[str | int],
33
34
  parent_ids: str | list[str] | None = None,
34
35
  cardinality_idx: int | None = None,
35
36
  ):
@@ -44,27 +45,21 @@ class DataRecord:
44
45
  if isinstance(parent_ids, str):
45
46
  parent_ids = [parent_ids]
46
47
 
47
- # schema for the data record
48
- self.schema = schema
49
-
50
- # mapping from field names to Field objects; effectively a mapping from a field name to its type
51
- self.field_types: dict[str, FieldInfo] = schema.model_fields
52
-
53
- # mapping from field names to their values
54
- self.field_values: dict[str, Any] = {}
48
+ # data for the data record
49
+ self._data_item = data_item
55
50
 
56
51
  # the index in the root Dataset from which this DataRecord is derived;
57
52
  # each source index takes the form: f"{root_dataset.id}-{idx}"
58
- self.source_indices = sorted(source_indices)
53
+ self._source_indices = sorted(source_indices)
59
54
 
60
55
  # the id(s) of the parent record(s) from which this DataRecord is derived
61
- self.parent_ids = parent_ids
56
+ self._parent_ids = parent_ids
62
57
 
63
58
  # store the cardinality index
64
- self.cardinality_idx = cardinality_idx
59
+ self._cardinality_idx = cardinality_idx
65
60
 
66
61
  # indicator variable which may be flipped by filter operations to signal when a record has been filtered out
67
- self.passed_operator = True
62
+ self._passed_operator = True
68
63
 
69
64
  # NOTE: Record ids are hashed based on:
70
65
  # 0. their schema (keys)
@@ -78,106 +73,98 @@ class DataRecord:
78
73
  # We may revisit this hashing scheme in the future.
79
74
 
80
75
  # unique identifier for the record
76
+ schema_fields = sorted(list(type(data_item).model_fields))
81
77
  id_str = (
82
- str(schema) + str(parent_ids) if parent_ids is not None else str(self.source_indices)
78
+ str(schema_fields) + str(parent_ids) if parent_ids is not None else str(self._source_indices)
83
79
  if cardinality_idx is None
84
- else str(schema) + str(cardinality_idx) + str(parent_ids) if parent_ids is not None else str(self.source_indices)
80
+ else str(schema_fields) + str(cardinality_idx) + str(parent_ids) if parent_ids is not None else str(self._source_indices)
85
81
  )
86
- # TODO(Jun): build-in id should has a special name, the current self.id is too general which would conflict with user defined schema too easily.
87
- # the options: built_in_id, generated_id
88
- self.id = hash_for_id(id_str)
82
+ self._id = hash_for_id(id_str)
89
83
 
90
84
 
85
+ # TODO: raise an exception if one of these fields is present in the schema
86
+ # - put these in a constant list up top
87
+ # - import the constant list in Dataset (if possible) and check at plan creation time
91
88
  def __setattr__(self, name: str, value: Any, /) -> None:
92
- if name in ["schema", "field_types", "field_values", "source_indices", "parent_ids", "cardinality_idx", "passed_operator", "id"]:
89
+ if name in ["_data_item", "_source_indices", "_parent_ids", "_cardinality_idx", "_passed_operator", "_id"]:
93
90
  super().__setattr__(name, value)
94
91
  else:
95
- self.field_values[name] = value
92
+ setattr(self._data_item, name, value)
96
93
 
97
94
 
98
95
  def __getattr__(self, name: str) -> Any:
99
- if name == "field_values":
100
- pass
101
- elif name in self.field_values:
102
- return self.field_values[name]
103
- else:
104
- raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
105
-
96
+ field = getattr(self._data_item, name, None)
97
+ if field is not None:
98
+ return field
99
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
106
100
 
107
101
  def __getitem__(self, field: str) -> Any:
108
- return self.__getattr__(field)
102
+ return getattr(self._data_item, field)
109
103
 
110
104
 
111
105
  def __setitem__(self, field: str, value: Any) -> None:
112
- self.__setattr__(field, value)
106
+ setattr(self._data_item, field, value)
113
107
 
114
108
 
115
109
  def __str__(self, truncate: int | None = 15) -> str:
116
110
  if truncate is not None:
117
- items = (f"{k}={str(v)[:truncate]!r}{'...' if len(str(v)) > truncate else ''}" for k, v in sorted(self.field_values.items()))
111
+ items = (f"{k}={str(v)[:truncate]!r}{'...' if len(str(v)) > truncate else ''}" for k, v in sorted(self._data_item.model_dump().items()))
118
112
  else:
119
- items = (f"{k}={v!r}" for k, v in sorted(self.field_values.items()))
113
+ items = (f"{k}={v!r}" for k, v in sorted(self._data_item.model_dump().items()))
120
114
  return "{}({})".format(type(self).__name__, ", ".join(items))
121
115
 
116
+
122
117
  def __repr__(self) -> str:
123
118
  return self.__str__(truncate=None)
124
119
 
120
+
125
121
  def __eq__(self, other):
126
- return isinstance(other, DataRecord) and self.field_values == other.field_values and self.schema == other.schema
122
+ return isinstance(other, DataRecord) and self._data_item == other._data_item
123
+
127
124
 
128
125
  def __hash__(self):
129
126
  return hash(self.to_json_str(bytes_to_str=True, sorted=True))
130
127
 
131
128
 
132
129
  def __iter__(self):
133
- yield from self.field_values.items()
130
+ yield from self._data_item.__iter__()
134
131
 
135
132
 
136
133
  def get_field_names(self):
137
- return list(self.field_values.keys())
134
+ return list(type(self._data_item).model_fields.keys())
138
135
 
139
136
 
140
137
  def get_field_type(self, field_name: str) -> FieldInfo:
141
- return self.field_types[field_name]
138
+ return type(self._data_item).model_fields[field_name]
139
+
140
+ @property
141
+ def schema(self) -> type[BaseModel]:
142
+ return type(self._data_item)
142
143
 
144
+ def copy(self):
145
+ # get the set of fields to copy from the parent record
146
+ copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
147
+
148
+ # copy field types and values from the parent
149
+ data_item = {field_name: self[field_name] for field_name in copy_field_names}
143
150
 
144
- def copy(self, include_bytes: bool = True, project_cols: list[str] | None = None):
145
151
  # make copy of the current record
146
152
  new_dr = DataRecord(
147
- self.schema,
148
- source_indices=self.source_indices,
149
- parent_ids=self.parent_ids,
150
- cardinality_idx=self.cardinality_idx,
153
+ self.schema(**data_item),
154
+ source_indices=self._source_indices,
155
+ parent_ids=self._parent_ids,
156
+ cardinality_idx=self._cardinality_idx,
151
157
  )
152
158
 
153
159
  # copy the passed_operator attribute
154
- new_dr.passed_operator = self.passed_operator
155
-
156
- # get the set of fields to copy from the parent record
157
- copy_field_names = project_cols if project_cols is not None else self.get_field_names()
158
- copy_field_names = [field.split(".")[-1] for field in copy_field_names]
159
-
160
- # copy field types and values from the parent
161
- for field_name in copy_field_names:
162
- field_type = self.get_field_type(field_name)
163
- field_value = self[field_name]
164
- if (
165
- not include_bytes
166
- and isinstance(field_value, bytes)
167
- or (isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], bytes))
168
- ):
169
- continue
170
-
171
- # set field and value
172
- new_dr.field_types[field_name] = field_type
173
- new_dr[field_name] = field_value
160
+ new_dr._passed_operator = self._passed_operator
174
161
 
175
162
  return new_dr
176
163
 
177
-
178
164
  @staticmethod
179
165
  def from_parent(
180
- schema: BaseModel,
166
+ schema: type[BaseModel],
167
+ data_item: dict,
181
168
  parent_record: DataRecord,
182
169
  project_cols: list[str] | None = None,
183
170
  cardinality_idx: int | None = None,
@@ -194,29 +181,33 @@ class DataRecord:
194
181
  new_schema = union_schemas([schema, parent_record.schema])
195
182
  new_schema = project(new_schema, project_cols)
196
183
 
197
- # make new record which has parent_record as its parent (and the same source_indices)
198
- new_dr = DataRecord(
199
- new_schema,
200
- source_indices=parent_record.source_indices,
201
- parent_ids=[parent_record.id],
202
- cardinality_idx=cardinality_idx,
203
- )
204
-
205
184
  # get the set of fields and field descriptions to copy from the parent record
206
- copy_field_names = project_cols if project_cols is not None else parent_record.get_field_names()
185
+ copy_field_names = parent_record.get_field_names() if project_cols is None else project_cols
207
186
  copy_field_names = [field.split(".")[-1] for field in copy_field_names]
208
187
 
209
188
  # copy fields from the parent
210
- for field_name in copy_field_names:
211
- new_dr.field_types[field_name] = parent_record.get_field_type(field_name)
212
- new_dr[field_name] = parent_record[field_name]
189
+ data_item.update({field_name: parent_record[field_name] for field_name in copy_field_names})
213
190
 
214
- return new_dr
191
+ # corner-case: wrap values in lists if the new schema expects a list but the data item has a single value
192
+ for field_name, field_info in new_schema.model_fields.items():
193
+ field_should_be_list = hasattr(field_info.annotation, '__origin__') and field_info.annotation.__origin__ is list
194
+ field_is_not_list = field_name in data_item and not isinstance(data_item[field_name], list)
195
+ if field_should_be_list and field_is_not_list:
196
+ data_item[field_name] = [data_item[field_name]]
197
+
198
+ # make new record which has parent_record as its parent (and the same source_indices)
199
+ new_dr = DataRecord(
200
+ new_schema(**data_item),
201
+ source_indices=parent_record._source_indices,
202
+ parent_ids=[parent_record._id],
203
+ cardinality_idx=cardinality_idx,
204
+ )
215
205
 
206
+ return new_dr
216
207
 
217
208
  @staticmethod
218
209
  def from_agg_parents(
219
- schema: BaseModel,
210
+ data_item: BaseModel,
220
211
  parent_records: DataRecordSet,
221
212
  cardinality_idx: int | None = None,
222
213
  ) -> DataRecord:
@@ -224,33 +215,25 @@ class DataRecord:
224
215
  source_indices = [
225
216
  source_idx
226
217
  for parent_record in parent_records
227
- for source_idx in parent_record.source_indices
218
+ for source_idx in parent_record._source_indices
228
219
  ]
229
220
 
230
221
  # make new record which has all parent records as its parents
231
222
  return DataRecord(
232
- schema,
223
+ data_item,
233
224
  source_indices=source_indices,
234
- parent_ids=[parent_record.id for parent_record in parent_records],
225
+ parent_ids=[parent_record._id for parent_record in parent_records],
235
226
  cardinality_idx=cardinality_idx,
236
227
  )
237
228
 
238
229
  @staticmethod
239
230
  def from_join_parents(
240
- schema: BaseModel,
231
+ schema: type[BaseModel],
241
232
  left_parent_record: DataRecord,
242
233
  right_parent_record: DataRecord,
243
234
  project_cols: list[str] | None = None,
244
235
  cardinality_idx: int = None,
245
236
  ) -> DataRecord:
246
- # make new record which has left and right parent record as its parents
247
- new_dr = DataRecord(
248
- schema,
249
- source_indices=list(left_parent_record.source_indices) + list(right_parent_record.source_indices),
250
- parent_ids=[left_parent_record.id, right_parent_record.id],
251
- cardinality_idx=cardinality_idx,
252
- )
253
-
254
237
  # get the set of fields and field descriptions to copy from the parent record(s)
255
238
  left_copy_field_names = (
256
239
  left_parent_record.get_field_names()
@@ -266,23 +249,26 @@ class DataRecord:
266
249
  right_copy_field_names = [field.split(".")[-1] for field in right_copy_field_names]
267
250
 
268
251
  # copy fields from the parents
269
- for field_name in left_copy_field_names:
270
- new_dr.field_types[field_name] = left_parent_record.get_field_type(field_name)
271
- new_dr[field_name] = left_parent_record[field_name]
272
-
252
+ data_item = {field_name: left_parent_record[field_name] for field_name in left_copy_field_names}
273
253
  for field_name in right_copy_field_names:
274
254
  new_field_name = field_name
275
255
  if field_name in left_copy_field_names:
276
256
  new_field_name = f"{field_name}_right"
277
- new_dr.field_types[new_field_name] = right_parent_record.get_field_type(field_name)
278
- new_dr[new_field_name] = right_parent_record[field_name]
257
+ data_item[new_field_name] = right_parent_record[field_name]
279
258
 
280
- return new_dr
259
+ # make new record which has left and right parent record as its parents
260
+ new_dr = DataRecord(
261
+ schema(**data_item),
262
+ source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
263
+ parent_ids=[left_parent_record._id, right_parent_record._id],
264
+ cardinality_idx=cardinality_idx,
265
+ )
281
266
 
267
+ return new_dr
282
268
 
283
269
  # TODO: unused outside of unit tests
284
270
  @staticmethod
285
- def from_df(df: pd.DataFrame, schema: BaseModel | None = None) -> list[DataRecord]:
271
+ def from_df(df: pd.DataFrame, schema: type[BaseModel] | None = None) -> list[DataRecord]:
286
272
  """Create a list of DataRecords from a pandas DataFrame
287
273
 
288
274
  Args:
@@ -309,9 +295,7 @@ class DataRecord:
309
295
  records = []
310
296
  for idx, row in df.iterrows():
311
297
  row_dict = row.to_dict()
312
- record = DataRecord(schema=schema, source_indices=[f"{dataset_id}-{idx}"])
313
- record.field_values = row_dict
314
- record.field_types = {field_name: schema.model_fields[field_name] for field_name in row_dict}
298
+ record = DataRecord(schema(**row_dict), source_indices=[f"{dataset_id}-{idx}"])
315
299
  records.append(record)
316
300
 
317
301
  return records
@@ -346,9 +330,8 @@ class DataRecord:
346
330
  # TODO(chjun): In case of numpy types, the json.dumps will fail. Convert to native types.
347
331
  # Better ways to handle this.
348
332
  field_values = {
349
- k: v.description
350
- if isinstance(v, context.Context) else v
351
- for k, v in self.field_values.items()
333
+ k: v.description if isinstance(v, context.Context) else v
334
+ for k, v in self._data_item.model_dump().items()
352
335
  }
353
336
  dct = pd.Series(field_values).to_dict()
354
337
 
@@ -358,7 +341,7 @@ class DataRecord:
358
341
 
359
342
  if not include_bytes:
360
343
  for k in dct:
361
- field_type = self.field_types[k]
344
+ field_type = self.get_field_type(k)
362
345
  if field_type.annotation in [bytes, AudioBase64, ImageBase64, list[bytes], list[ImageBase64]]:
363
346
  dct[k] = "<bytes>"
364
347
 
@@ -374,11 +357,11 @@ class DataRecord:
374
357
 
375
358
  if mask_filepaths:
376
359
  for k in dct:
377
- field_type = self.field_types[k]
360
+ field_type = self.get_field_type(k)
378
361
  if field_type.annotation in [AudioBase64, AudioFilepath, ImageBase64, ImageFilepath, ImageURL]:
379
362
  dct[k] = "<bytes>"
380
363
 
381
- return dct
364
+ return deepcopy(dct)
382
365
 
383
366
 
384
367
  class DataRecordSet:
@@ -399,8 +382,8 @@ class DataRecordSet:
399
382
  # set data_records, parent_ids, and source_indices; note that it is possible for
400
383
  # data_records to be an empty list in the event of a failed convert
401
384
  self.data_records = data_records
402
- self.parent_ids = data_records[0].parent_ids if len(data_records) > 0 else None
403
- self.source_indices = data_records[0].source_indices if len(data_records) > 0 else None
385
+ self.parent_ids = data_records[0]._parent_ids if len(data_records) > 0 else None
386
+ self.source_indices = data_records[0]._source_indices if len(data_records) > 0 else None
404
387
  self.schema = data_records[0].schema if len(data_records) > 0 else None
405
388
 
406
389
  # the input to the operator which produced the data_records; type is tuple[DataRecord] | tuple[int]
@@ -448,7 +431,6 @@ class DataRecordCollection:
448
431
  DataRecordSet is used for the output of executing an operator.
449
432
  DataRecordCollection is used for the output of executing a query, we definitely could extend it to support more advanced features for output of execute().
450
433
  """
451
- # TODO(Jun): consider to have stats_manager class to centralize stats management.
452
434
  def __init__(self, data_records: list[DataRecord], execution_stats: ExecutionStats | None = None, plan_stats: PlanStats | None = None):
453
435
  self.data_records = data_records
454
436
  self.execution_stats = execution_stats