palimpzest 0.8.1__tar.gz → 0.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {palimpzest-0.8.1/src/palimpzest.egg-info → palimpzest-0.8.3}/PKG-INFO +3 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/pyproject.toml +3 -9
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/constants.py +38 -62
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/dataset.py +1 -1
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/iter_dataset.py +5 -5
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/groupbysig.py +1 -1
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/records.py +91 -109
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/lib/schemas.py +23 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/models.py +3 -3
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/__init__.py +2 -6
- palimpzest-0.8.3/src/palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest-0.8.3/src/palimpzest/prompts/critique_and_refine_prompts.py +66 -0
- palimpzest-0.8.3/src/palimpzest/prompts/filter_prompts.py +76 -0
- palimpzest-0.8.3/src/palimpzest/prompts/join_prompts.py +100 -0
- palimpzest-0.8.1/src/palimpzest/prompts/moa_aggregator_convert_prompts.py → palimpzest-0.8.3/src/palimpzest/prompts/moa_aggregator_prompts.py +51 -2
- palimpzest-0.8.3/src/palimpzest/prompts/moa_proposer_prompts.py +87 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/prompt_factory.py +351 -479
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/split_merge_prompts.py +51 -2
- palimpzest-0.8.3/src/palimpzest/prompts/split_proposer_prompts.py +87 -0
- palimpzest-0.8.3/src/palimpzest/prompts/utils.py +109 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/execution_strategy.py +4 -4
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/mab_execution_strategy.py +47 -23
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/parallel_execution_strategy.py +3 -3
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +8 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/generators/generators.py +31 -17
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/__init__.py +15 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/aggregate.py +21 -19
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/compute.py +6 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/convert.py +12 -37
- palimpzest-0.8.3/src/palimpzest/query/operators/critique_and_refine.py +194 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/distinct.py +7 -7
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/filter.py +13 -25
- palimpzest-0.8.3/src/palimpzest/query/operators/join.py +532 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/limit.py +4 -4
- palimpzest-0.8.3/src/palimpzest/query/operators/mixture_of_agents.py +246 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/physical.py +25 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/project.py +4 -4
- palimpzest-0.8.1/src/palimpzest/query/operators/rag_convert.py → palimpzest-0.8.3/src/palimpzest/query/operators/rag.py +202 -5
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/retrieve.py +10 -9
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/scan.py +9 -10
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/search.py +18 -24
- palimpzest-0.8.3/src/palimpzest/query/operators/split.py +321 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/__init__.py +12 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/optimizer.py +12 -10
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/rules.py +201 -108
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/tasks.py +18 -6
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/config.py +2 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/query_processor.py +2 -2
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/query_processor_factory.py +9 -5
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/validator/validator.py +7 -9
- {palimpzest-0.8.1 → palimpzest-0.8.3/src/palimpzest.egg-info}/PKG-INFO +3 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/SOURCES.txt +8 -8
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/requires.txt +1 -7
- palimpzest-0.8.1/src/palimpzest/prompts/convert_prompts.py +0 -143
- palimpzest-0.8.1/src/palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -216
- palimpzest-0.8.1/src/palimpzest/prompts/filter_prompts.py +0 -114
- palimpzest-0.8.1/src/palimpzest/prompts/join_prompts.py +0 -163
- palimpzest-0.8.1/src/palimpzest/prompts/moa_proposer_convert_prompts.py +0 -75
- palimpzest-0.8.1/src/palimpzest/prompts/split_proposer_prompts.py +0 -55
- palimpzest-0.8.1/src/palimpzest/prompts/util_phrases.py +0 -19
- palimpzest-0.8.1/src/palimpzest/query/operators/critique_and_refine_convert.py +0 -113
- palimpzest-0.8.1/src/palimpzest/query/operators/join.py +0 -403
- palimpzest-0.8.1/src/palimpzest/query/operators/mixture_of_agents_convert.py +0 -140
- palimpzest-0.8.1/src/palimpzest/query/operators/split_convert.py +0 -170
- {palimpzest-0.8.1 → palimpzest-0.8.3}/LICENSE +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/README.md +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/setup.cfg +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/agents/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/agents/compute_agents.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/agents/search_agents.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/context.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/context_manager.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/data/index_dataset.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/elements/filters.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/core/lib/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/policy.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/agent_prompts.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/context_search.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/prompts/validator.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/generators/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/operators/logical.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/cost_model.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/plan.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/optimizer/primitives.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/query/processor/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/schemabuilder/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/README.md +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/allenpdf.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/pdfparser.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/tools/skema_tools.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/env_helpers.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/hash_helpers.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/model_helpers.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/progress.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/utils/udfs.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest/validator/__init__.py +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/dependency_links.txt +0 -0
- {palimpzest-0.8.1 → palimpzest-0.8.3}/src/palimpzest.egg-info/top_level.txt +0 -0
(An ellipsis `…` marks a deleted line whose content is truncated in the published diff.)

```diff
--- palimpzest-0.8.1/src/palimpzest.egg-info/PKG-INFO
+++ palimpzest-0.8.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.8.1
+Version: 0.8.3
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -12,7 +12,7 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
-Requires-Python: >=3.…
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anthropic>=0.55.0
@@ -22,7 +22,7 @@ Requires-Dist: colorama>=0.4.6
 Requires-Dist: datasets>=4.0.0
 Requires-Dist: fastapi~=0.115.0
 Requires-Dist: gradio>=5.26.0
-Requires-Dist: litellm>=1.…
+Requires-Dist: litellm>=1.76.1
 Requires-Dist: numpy==2.0.2
 Requires-Dist: openai>=1.0
 Requires-Dist: pandas>=2.1.1
@@ -44,11 +44,6 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: together>=1.5.5
 Requires-Dist: tqdm~=4.66.1
 Requires-Dist: rich[jupyter]>=13.9.2
-Provides-Extra: docs
-Requires-Dist: mkdocs>=1.6.1; extra == "docs"
-Requires-Dist: mkdocs-material>=9.6.3; extra == "docs"
-Requires-Dist: mkdocstrings-python>=1.15.0; extra == "docs"
-Requires-Dist: mkdocs-material[imaging]; extra == "docs"
 Provides-Extra: vllm
 Requires-Dist: vllm>=0.10.1.1; extra == "vllm"
 Dynamic: license-file
```
```diff
--- palimpzest-0.8.1/pyproject.toml
+++ palimpzest-0.8.3/pyproject.toml
@@ -1,9 +1,9 @@
 [project]
 name = "palimpzest"
-version = "0.8.1"
+version = "0.8.3"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
-requires-python = ">=3.…
+requires-python = ">=3.10"
 keywords = ["relational", "optimization", "llm", "AI programming", "extraction", "tools", "document", "search", "integration"]
 authors = [
     {name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
@@ -16,7 +16,7 @@ dependencies = [
     "datasets>=4.0.0",
     "fastapi~=0.115.0",
     "gradio>=5.26.0",
-    "litellm>=1.…
+    "litellm>=1.76.1",
     "numpy==2.0.2",
     "openai>=1.0",
     "pandas>=2.1.1",
@@ -49,12 +49,6 @@ classifiers=[
 ]
 
 [project.optional-dependencies]
-docs = [
-    "mkdocs>=1.6.1",
-    "mkdocs-material>=9.6.3",
-    "mkdocstrings-python>=1.15.0",
-    "mkdocs-material[imaging]",
-]
 vllm = [
     "vllm>=0.10.1.1",
 ]
```
```diff
--- palimpzest-0.8.1/src/palimpzest/constants.py
+++ palimpzest-0.8.3/src/palimpzest/constants.py
@@ -25,8 +25,6 @@ class Model(str, Enum):
     GPT_5_MINI = "openai/gpt-5-mini-2025-08-07"
     GPT_5_NANO = "openai/gpt-5-nano-2025-08-07"
     o4_MINI = "openai/o4-mini-2025-04-16"  # noqa: N815
-    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
-    CLIP_VIT_B_32 = "clip-ViT-B-32"
     CLAUDE_3_5_SONNET = "anthropic/claude-3-5-sonnet-20241022"
     CLAUDE_3_7_SONNET = "anthropic/claude-3-7-sonnet-20250219"
     CLAUDE_3_5_HAIKU = "anthropic/claude-3-5-haiku-20241022"
@@ -41,6 +39,8 @@ class Model(str, Enum):
     GPT_4o_MINI_AUDIO_PREVIEW = "openai/gpt-4o-mini-audio-preview"
     VLLM_QWEN_1_5_0_5B_CHAT = "hosted_vllm/qwen/Qwen1.5-0.5B-Chat"
     # o1 = "o1-2024-12-17"
+    TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
+    CLIP_VIT_B_32 = "clip-ViT-B-32"
 
     def __repr__(self):
         return f"{self.name}"
@@ -136,69 +136,38 @@ class PromptStrategy(str, Enum):
     performing some task with a specified Model.
     """
 
-    # … [26 deleted lines (old 139-164) are truncated in the published diff]
-    COT_QA_REFINE = "chain-of-thought-question-refine"
-
-    # Chain-of-Thought Question with Image Prompt Strategies
-    COT_QA_IMAGE = "chain-of-thought-question-image"
-    COT_QA_IMAGE_NO_REASONING = "chain-of-thought-question-image-no-reasoning"
-    COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
-    COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
-
-    # Chain-of-Thought Queestion with Audio Prompt Strategies
-    COT_QA_AUDIO = "chain-of-thought-question-audio"
-    COT_QA_AUDIO_NO_REASONING = "chain-of-thought-question-audio-no-reasoning"
-    # TODO: COT_QA_AUDIO_CRITIC/REFINE
-
-    # Mixture-of-Agents Prompt Strategies
-    COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
-    COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
-    COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
-    # TODO: COT_MOA_PROPOSER_AUDIO
-
-    # Split Convert Prompt Strategies
-    SPLIT_PROPOSER = "split-proposer"
-    SPLIT_MERGER = "split-merger"
-
-    def is_image_prompt(self):
-        return "image" in self.value
-
-    def is_audio_prompt(self):
-        return "audio" in self.value
-
-    def is_bool_prompt(self):
-        return "bool" in self.value
+    # filter prompt strategies
+    FILTER = "filter"
+    FILTER_NO_REASONING = "filter-no-reasoning"
+    FILTER_CRITIC = "filter-critic"
+    FILTER_REFINE = "filter-refine"
+    FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
+    FILTER_MOA_AGG = "filter-mixture-of-agents-aggregation"
+    FILTER_SPLIT_PROPOSER = "filter-split-proposer"
+    FILTER_SPLIT_MERGER = "filter-split-merger"
+
+    # join prompt strategies
+    JOIN = "join"
+    JOIN_NO_REASONING = "join-no-reasoning"
+
+    # map prompt strategies
+    MAP = "map"
+    MAP_NO_REASONING = "map-no-reasoning"
+    MAP_CRITIC = "map-critic"
+    MAP_REFINE = "map-refine"
+    MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
+    MAP_MOA_AGG = "map-mixture-of-agents-aggregation"
+    MAP_SPLIT_PROPOSER = "map-split-proposer"
+    MAP_SPLIT_MERGER = "map-split-merger"
+
+    def is_filter_prompt(self):
+        return "filter" in self.value
 
     def is_join_prompt(self):
         return "join" in self.value
 
-    def …
-    return "…
+    def is_map_prompt(self):
+        return "map" in self.value
 
     def is_critic_prompt(self):
         return "critic" in self.value
@@ -221,6 +190,13 @@ class PromptStrategy(str, Enum):
     def is_no_reasoning_prompt(self):
        return "no-reasoning" in self.value
 
+
+class Modality(str, Enum):
+    TEXT = "text"
+    IMAGE = "image"
+    AUDIO = "audio"
+
+
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
@@ -527,7 +503,7 @@ CLIP_VIT_B_32_MODEL_CARD = {
     ##### Time #####
     "seconds_per_output_token": 0.0098,  # NOTE: just copying TEXT_EMBEDDING_3_SMALL_MODEL_CARD for now
     ##### Agg. Benchmark #####
-    "overall": 63.3,  # NOTE: …
+    "overall": 63.3,  # NOTE: imageNet top-1 accuracy
 }
 CLAUDE_3_5_SONNET_MODEL_CARD = {
     ##### Cost in USD #####
```
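Since the 0.8.3 `PromptStrategy` values are plain substrings, the `is_*_prompt` helpers are simple membership checks, and the new `Modality` enum takes over the text/image/audio distinction that the removed per-strategy variants (e.g. `COT_QA_IMAGE`) used to encode. A minimal sketch of how they behave, assuming both enums import from `palimpzest.constants` as the file path above suggests:

```python
# Illustrative only; the enum members and helper methods are taken from the diff above.
from palimpzest.constants import Modality, PromptStrategy

strategy = PromptStrategy.MAP_CRITIC        # value: "map-critic"
assert strategy.is_map_prompt()             # "map" is a substring of "map-critic"
assert strategy.is_critic_prompt()          # "critic" is a substring of "map-critic"
assert not strategy.is_filter_prompt()      # no "filter" substring

# Modality replaces the old per-strategy image/audio variants.
print([m.value for m in Modality])          # ['text', 'image', 'audio']
```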
```diff
--- palimpzest-0.8.1/src/palimpzest/core/data/dataset.py
+++ palimpzest-0.8.3/src/palimpzest/core/data/dataset.py
@@ -595,7 +595,7 @@ class Dataset:
 
         return QueryProcessorFactory.create_and_run_processor(self, config)
 
-    def optimize_and_run(self, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, …
+    def optimize_and_run(self, config: QueryProcessorConfig | None = None, train_dataset: dict[str, Dataset] | Dataset | None = None, validator: Validator | None = None, **kwargs):
         """Optimize the PZ program using the train_dataset and validator before running the optimized plan."""
         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
         from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
```
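In 0.8.3, `optimize_and_run` takes an optional `QueryProcessorConfig` as its first parameter and accepts `**kwargs`, so 0.8.1 call sites that passed `train_dataset` positionally need updating. A hedged sketch of the new call shape; the variable names, the no-argument config construction, and the import path (inferred from `src/palimpzest/query/processor/config.py` in the file list) are assumptions, not from the package:

```python
# Hypothetical call site; `ds`, `train_ds`, and `my_validator` are placeholders.
from palimpzest.query.processor.config import QueryProcessorConfig  # assumed import path

config = QueryProcessorConfig()  # assumed to be constructible with defaults

# 0.8.1 signature began with train_dataset; 0.8.3 puts the config first.
# Keyword arguments keep the call unambiguous across both versions.
output = ds.optimize_and_run(config, train_dataset=train_ds, validator=my_validator)
```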
```diff
--- palimpzest-0.8.1/src/palimpzest/core/data/iter_dataset.py
+++ palimpzest-0.8.3/src/palimpzest/core/data/iter_dataset.py
@@ -227,7 +227,7 @@ class HTMLFileDataset(BaseFileDataset):
             path (str): The path to the directory
         """
         super().__init__(path=path, id=id, schema=WebPage)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.HTML_EXTENSIONS))]
 
     def _html_to_text_with_links(self, html: str) -> str:
         # Parse the HTML content
@@ -295,7 +295,7 @@ class ImageFileDataset(BaseFileDataset):
             path (str): The path to the directory
         """
         super().__init__(path=path, id=id, schema=ImageFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.IMAGE_EXTENSIONS))]
 
     def __getitem__(self, idx: int) -> dict:
         """
@@ -347,7 +347,7 @@ class PDFFileDataset(BaseFileDataset):
             file_cache_dir (str): The directory to store the temporary files generated during PDF processing
         """
         super().__init__(path=path, id=id, schema=PDFFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.PDF_EXTENSIONS))]
         self.pdfprocessor = pdfprocessor
         self.file_cache_dir = file_cache_dir
 
@@ -432,7 +432,7 @@ class XLSFileDataset(BaseFileDataset):
         Constructor for the `XLSFileDataset` class. The `schema` is set to the `XLSFile` schema.
         """
         super().__init__(path=path, id=id, schema=XLSFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.XLS_EXTENSIONS))]
 
     def __getitem__(self, idx: int) -> dict:
         """
@@ -483,7 +483,7 @@ class AudioFileDataset(BaseFileDirectoryDataset):
             path (str): The path to the directory
         """
         super().__init__(path=path, id=id, schema=AudioFile)
-
+        self.filepaths = [fp for fp in self.filepaths if fp.endswith(tuple(constants.AUDIO_EXTENSIONS))]
 
     def __getitem__(self, idx: int) -> dict:
         """
```
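Each constructor now keeps only the files whose extension matches its schema; `str.endswith` accepts a tuple of suffixes, which is what makes the one-liner work. A self-contained sketch with made-up extension values (the real lists live in `palimpzest.constants`):

```python
# Illustrative stand-in for constants.HTML_EXTENSIONS; the real values may differ.
HTML_EXTENSIONS = [".html", ".htm"]

filepaths = ["index.html", "notes.txt", "about.htm"]
# Same filtering pattern as the new constructor lines above.
filepaths = [fp for fp in filepaths if fp.endswith(tuple(HTML_EXTENSIONS))]
print(filepaths)  # ['index.html', 'about.htm']
```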
```diff
--- palimpzest-0.8.1/src/palimpzest/core/elements/groupbysig.py
+++ palimpzest-0.8.3/src/palimpzest/core/elements/groupbysig.py
@@ -16,7 +16,7 @@ class GroupBySig:
         self.agg_funcs = agg_funcs
         self.agg_fields = agg_fields
 
-    def validate_schema(self, input_schema: BaseModel) -> tuple[bool, str | None]:
+    def validate_schema(self, input_schema: type[BaseModel]) -> tuple[bool, str | None]:
         for f in self.group_by_fields:
             if f not in input_schema.model_fields:
                 return (False, "Supplied schema has no field " + f)
```
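The annotation change is a correctness fix: `validate_schema` receives the pydantic schema class itself (it reads the class-level `model_fields`), so `type[BaseModel]` is the accurate hint. A small sketch of the distinction, with an example schema that is not from the package:

```python
from pydantic import BaseModel

class Person(BaseModel):  # example schema, not from the package
    name: str
    age: int

def validate_schema(input_schema: type[BaseModel]) -> bool:
    # model_fields lives on the class, so the function takes the class, not an instance
    return "name" in input_schema.model_fields

assert validate_schema(Person)  # pass the class itself
# The old hint `BaseModel` wrongly suggested passing Person(name="a", age=1).
```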
```diff
--- palimpzest-0.8.1/src/palimpzest/core/elements/records.py
+++ palimpzest-0.8.3/src/palimpzest/core/elements/records.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import json
 from collections.abc import Generator
+from copy import deepcopy
 from typing import Any
 
 import pandas as pd
@@ -28,8 +29,8 @@ class DataRecord:
 
     def __init__(
         self,
-        …
-        source_indices: str | list[str],
+        data_item: BaseModel,
+        source_indices: str | int | list[str | int],
         parent_ids: str | list[str] | None = None,
         cardinality_idx: int | None = None,
     ):
@@ -44,27 +45,21 @@ class DataRecord:
         if isinstance(parent_ids, str):
             parent_ids = [parent_ids]
 
-        # …
-        self.…
-
-        # mapping from field names to Field objects; effectively a mapping from a field name to its type
-        self.field_types: dict[str, FieldInfo] = schema.model_fields
-
-        # mapping from field names to their values
-        self.field_values: dict[str, Any] = {}
+        # data for the data record
+        self._data_item = data_item
 
         # the index in the root Dataset from which this DataRecord is derived;
         # each source index takes the form: f"{root_dataset.id}-{idx}"
-        self.…
+        self._source_indices = sorted(source_indices)
 
         # the id(s) of the parent record(s) from which this DataRecord is derived
-        self.…
+        self._parent_ids = parent_ids
 
         # store the cardinality index
-        self.…
+        self._cardinality_idx = cardinality_idx
 
         # indicator variable which may be flipped by filter operations to signal when a record has been filtered out
-        self.…
+        self._passed_operator = True
 
         # NOTE: Record ids are hashed based on:
         # 0. their schema (keys)
@@ -78,106 +73,98 @@ class DataRecord:
         # We may revisit this hashing scheme in the future.
 
         # unique identifier for the record
+        schema_fields = sorted(list(type(data_item).model_fields))
         id_str = (
-            str(…
+            str(schema_fields) + str(parent_ids) if parent_ids is not None else str(self._source_indices)
             if cardinality_idx is None
-            else str(…
+            else str(schema_fields) + str(cardinality_idx) + str(parent_ids) if parent_ids is not None else str(self._source_indices)
         )
-
-        # the options: built_in_id, generated_id
-        self.id = hash_for_id(id_str)
+        self._id = hash_for_id(id_str)
 
+    # TODO: raise an exception if one of these fields is present in the schema
+    # - put these in a constant list up top
+    # - import the constant list in Dataset (if possible) and check at plan creation time
     def __setattr__(self, name: str, value: Any, /) -> None:
-        if name in ["…
+        if name in ["_data_item", "_source_indices", "_parent_ids", "_cardinality_idx", "_passed_operator", "_id"]:
             super().__setattr__(name, value)
         else:
-            self.…
+            setattr(self._data_item, name, value)
 
     def __getattr__(self, name: str) -> Any:
-        … [4 deleted lines (old 99-102) are truncated in the published diff]
-        else:
-            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
-
+        field = getattr(self._data_item, name, None)
+        if field is not None:
+            return field
+        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
 
     def __getitem__(self, field: str) -> Any:
-        return self.…
+        return getattr(self._data_item, field)
 
     def __setitem__(self, field: str, value: Any) -> None:
-        self.…
+        setattr(self._data_item, field, value)
 
     def __str__(self, truncate: int | None = 15) -> str:
         if truncate is not None:
-            items = (f"{k}={str(v)[:truncate]!r}{'...' if len(str(v)) > truncate else ''}" for k, v in sorted(self.…
+            items = (f"{k}={str(v)[:truncate]!r}{'...' if len(str(v)) > truncate else ''}" for k, v in sorted(self._data_item.model_dump().items()))
         else:
-            items = (f"{k}={v!r}" for k, v in sorted(self.…
+            items = (f"{k}={v!r}" for k, v in sorted(self._data_item.model_dump().items()))
         return "{}({})".format(type(self).__name__, ", ".join(items))
 
+
     def __repr__(self) -> str:
         return self.__str__(truncate=None)
 
+
     def __eq__(self, other):
-        return isinstance(other, DataRecord) and self.…
+        return isinstance(other, DataRecord) and self._data_item == other._data_item
+
 
     def __hash__(self):
         return hash(self.to_json_str(bytes_to_str=True, sorted=True))
 
     def __iter__(self):
-        yield from self.…
+        yield from self._data_item.__iter__()
 
     def get_field_names(self):
-        return list(self.…
+        return list(type(self._data_item).model_fields.keys())
 
     def get_field_type(self, field_name: str) -> FieldInfo:
-        return self.…
+        return type(self._data_item).model_fields[field_name]
+
+    @property
+    def schema(self) -> type[BaseModel]:
+        return type(self._data_item)
 
+    def copy(self):
+        # get the set of fields to copy from the parent record
+        copy_field_names = [field.split(".")[-1] for field in self.get_field_names()]
+
+        # copy field types and values from the parent
+        data_item = {field_name: self[field_name] for field_name in copy_field_names}
 
-    def copy(self, include_bytes: bool = True, project_cols: list[str] | None = None):
         # make copy of the current record
         new_dr = DataRecord(
-            self.schema,
-            source_indices=self.…
-            parent_ids=self.…
-            cardinality_idx=self.…
+            self.schema(**data_item),
+            source_indices=self._source_indices,
+            parent_ids=self._parent_ids,
+            cardinality_idx=self._cardinality_idx,
         )
 
         # copy the passed_operator attribute
-        new_dr.…
-
-        # get the set of fields to copy from the parent record
-        copy_field_names = project_cols if project_cols is not None else self.get_field_names()
-        copy_field_names = [field.split(".")[-1] for field in copy_field_names]
-
-        # copy field types and values from the parent
-        for field_name in copy_field_names:
-            field_type = self.get_field_type(field_name)
-            field_value = self[field_name]
-            if (
-                not include_bytes
-                and isinstance(field_value, bytes)
-                or (isinstance(field_value, list) and len(field_value) > 0 and isinstance(field_value[0], bytes))
-            ):
-                continue
-
-            # set field and value
-            new_dr.field_types[field_name] = field_type
-            new_dr[field_name] = field_value
+        new_dr._passed_operator = self._passed_operator
 
         return new_dr
 
-
     @staticmethod
     def from_parent(
-        schema: BaseModel,
+        schema: type[BaseModel],
+        data_item: dict,
         parent_record: DataRecord,
         project_cols: list[str] | None = None,
         cardinality_idx: int | None = None,
@@ -194,29 +181,33 @@ class DataRecord:
         new_schema = union_schemas([schema, parent_record.schema])
         new_schema = project(new_schema, project_cols)
 
-        # make new record which has parent_record as its parent (and the same source_indices)
-        new_dr = DataRecord(
-            new_schema,
-            source_indices=parent_record.source_indices,
-            parent_ids=[parent_record.id],
-            cardinality_idx=cardinality_idx,
-        )
-
         # get the set of fields and field descriptions to copy from the parent record
-        copy_field_names = …
+        copy_field_names = parent_record.get_field_names() if project_cols is None else project_cols
         copy_field_names = [field.split(".")[-1] for field in copy_field_names]
 
         # copy fields from the parent
-        for field_name in copy_field_names…
-            new_dr.field_types[field_name] = parent_record.get_field_type(field_name)
-            new_dr[field_name] = parent_record[field_name]
+        data_item.update({field_name: parent_record[field_name] for field_name in copy_field_names})
 
-
+        # corner-case: wrap values in lists if the new schema expects a list but the data item has a single value
+        for field_name, field_info in new_schema.model_fields.items():
+            field_should_be_list = hasattr(field_info.annotation, '__origin__') and field_info.annotation.__origin__ is list
+            field_is_not_list = field_name in data_item and not isinstance(data_item[field_name], list)
+            if field_should_be_list and field_is_not_list:
+                data_item[field_name] = [data_item[field_name]]
+
+        # make new record which has parent_record as its parent (and the same source_indices)
+        new_dr = DataRecord(
+            new_schema(**data_item),
+            source_indices=parent_record._source_indices,
+            parent_ids=[parent_record._id],
+            cardinality_idx=cardinality_idx,
+        )
 
+        return new_dr
 
     @staticmethod
     def from_agg_parents(
-        …
+        data_item: BaseModel,
         parent_records: DataRecordSet,
         cardinality_idx: int | None = None,
     ) -> DataRecord:
@@ -224,33 +215,25 @@ class DataRecord:
         source_indices = [
             source_idx
             for parent_record in parent_records
-            for source_idx in parent_record.…
+            for source_idx in parent_record._source_indices
         ]
 
         # make new record which has all parent records as its parents
         return DataRecord(
-            …
+            data_item,
             source_indices=source_indices,
-            parent_ids=[parent_record.…
+            parent_ids=[parent_record._id for parent_record in parent_records],
             cardinality_idx=cardinality_idx,
         )
 
     @staticmethod
     def from_join_parents(
-        schema: BaseModel,
+        schema: type[BaseModel],
         left_parent_record: DataRecord,
         right_parent_record: DataRecord,
         project_cols: list[str] | None = None,
         cardinality_idx: int = None,
     ) -> DataRecord:
-        # make new record which has left and right parent record as its parents
-        new_dr = DataRecord(
-            schema,
-            source_indices=list(left_parent_record.source_indices) + list(right_parent_record.source_indices),
-            parent_ids=[left_parent_record.id, right_parent_record.id],
-            cardinality_idx=cardinality_idx,
-        )
-
         # get the set of fields and field descriptions to copy from the parent record(s)
         left_copy_field_names = (
             left_parent_record.get_field_names()
@@ -266,23 +249,26 @@ class DataRecord:
         right_copy_field_names = [field.split(".")[-1] for field in right_copy_field_names]
 
         # copy fields from the parents
-        for field_name in left_copy_field_names…
-            new_dr.field_types[field_name] = left_parent_record.get_field_type(field_name)
-            new_dr[field_name] = left_parent_record[field_name]
-
+        data_item = {field_name: left_parent_record[field_name] for field_name in left_copy_field_names}
         for field_name in right_copy_field_names:
             new_field_name = field_name
             if field_name in left_copy_field_names:
                 new_field_name = f"{field_name}_right"
-
-            new_dr[new_field_name] = right_parent_record[field_name]
+            data_item[new_field_name] = right_parent_record[field_name]
 
-
+        # make new record which has left and right parent record as its parents
+        new_dr = DataRecord(
+            schema(**data_item),
+            source_indices=list(left_parent_record._source_indices) + list(right_parent_record._source_indices),
+            parent_ids=[left_parent_record._id, right_parent_record._id],
+            cardinality_idx=cardinality_idx,
+        )
 
+        return new_dr
 
     # TODO: unused outside of unit tests
     @staticmethod
-    def from_df(df: pd.DataFrame, schema: BaseModel | None = None) -> list[DataRecord]:
+    def from_df(df: pd.DataFrame, schema: type[BaseModel] | None = None) -> list[DataRecord]:
         """Create a list of DataRecords from a pandas DataFrame
 
         Args:
@@ -309,9 +295,7 @@ class DataRecord:
         records = []
         for idx, row in df.iterrows():
             row_dict = row.to_dict()
-            record = DataRecord(schema…
-            record.field_values = row_dict
-            record.field_types = {field_name: schema.model_fields[field_name] for field_name in row_dict}
+            record = DataRecord(schema(**row_dict), source_indices=[f"{dataset_id}-{idx}"])
             records.append(record)
 
         return records
@@ -346,9 +330,8 @@ class DataRecord:
         # TODO(chjun): In case of numpy types, the json.dumps will fail. Convert to native types.
         # Better ways to handle this.
         field_values = {
-            k: v.description…
-
-            for k, v in self.field_values.items()
+            k: v.description if isinstance(v, context.Context) else v
+            for k, v in self._data_item.model_dump().items()
         }
         dct = pd.Series(field_values).to_dict()
 
@@ -358,7 +341,7 @@ class DataRecord:
 
         if not include_bytes:
             for k in dct:
-                field_type = self.…
+                field_type = self.get_field_type(k)
                 if field_type.annotation in [bytes, AudioBase64, ImageBase64, list[bytes], list[ImageBase64]]:
                     dct[k] = "<bytes>"
 
@@ -374,11 +357,11 @@ class DataRecord:
 
         if mask_filepaths:
             for k in dct:
-                field_type = self.…
+                field_type = self.get_field_type(k)
                 if field_type.annotation in [AudioBase64, AudioFilepath, ImageBase64, ImageFilepath, ImageURL]:
                     dct[k] = "<bytes>"
 
-        return dct
+        return deepcopy(dct)
 
 
 class DataRecordSet:
@@ -399,8 +382,8 @@ class DataRecordSet:
         # set data_records, parent_ids, and source_indices; note that it is possible for
         # data_records to be an empty list in the event of a failed convert
         self.data_records = data_records
-        self.parent_ids = data_records[0].…
-        self.source_indices = data_records[0].…
+        self.parent_ids = data_records[0]._parent_ids if len(data_records) > 0 else None
+        self.source_indices = data_records[0]._source_indices if len(data_records) > 0 else None
        self.schema = data_records[0].schema if len(data_records) > 0 else None
 
         # the input to the operator which produced the data_records; type is tuple[DataRecord] | tuple[int]
@@ -448,7 +431,6 @@ class DataRecordCollection:
     DataRecordSet is used for the output of executing an operator.
     DataRecordCollection is used for the output of executing a query, we definitely could extend it to support more advanced features for output of execute().
     """
-    # TODO(Jun): consider to have stats_manager class to centralize stats management.
     def __init__(self, data_records: list[DataRecord], execution_stats: ExecutionStats | None = None, plan_stats: PlanStats | None = None):
         self.data_records = data_records
         self.execution_stats = execution_stats
```