palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.20.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/core/lib/schemas.py
CHANGED
|
@@ -1,321 +1,122 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import
|
|
4
|
-
from typing import Any
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Any, TypeAliasType
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
|
+
from pydantic import BaseModel, Field, create_model
|
|
8
|
+
from pydantic.fields import FieldInfo
|
|
7
9
|
|
|
8
|
-
from palimpzest.
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
""
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
""
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
attributes = [attr for attr in attributes if not attr.startswith("__")]
|
|
119
|
-
prefix = f"{cls.__name__}.{id}." if unique else ""
|
|
120
|
-
field_map = {prefix + attr: getattr(cls, attr) for attr in attributes if isinstance(getattr(cls, attr), Field)}
|
|
121
|
-
return field_map
|
|
122
|
-
|
|
123
|
-
@classmethod
|
|
124
|
-
def json_schema(cls) -> dict[str, TypingAny]:
|
|
125
|
-
"""The JSON representation of the Schema"""
|
|
126
|
-
fields = cls.field_names()
|
|
127
|
-
|
|
128
|
-
schema = {
|
|
129
|
-
"fields": {},
|
|
130
|
-
"type": "object",
|
|
131
|
-
"description": cls.__doc__,
|
|
132
|
-
}
|
|
133
|
-
for k in fields:
|
|
134
|
-
if k.startswith("_"):
|
|
135
|
-
continue
|
|
136
|
-
v = getattr(cls, k)
|
|
137
|
-
if v is None:
|
|
138
|
-
continue
|
|
139
|
-
|
|
140
|
-
schema["fields"][k] = v.json_schema()
|
|
141
|
-
|
|
142
|
-
return schema
|
|
143
|
-
|
|
144
|
-
@staticmethod
|
|
145
|
-
def field_to_json(field_name: str, field_value: TypingAny) -> TypingAny:
|
|
146
|
-
"""Return a representation of the specified field which will be used in its conversion to JSON"""
|
|
147
|
-
return field_value
|
|
148
|
-
|
|
149
|
-
@classmethod
|
|
150
|
-
def union(cls, other_schema: Schema, keep_duplicates: bool = False) -> Schema:
|
|
151
|
-
"""Return the union of this schema with the other_schema"""
|
|
152
|
-
# construct the new schema name
|
|
153
|
-
schema_name = cls.class_name()
|
|
154
|
-
other_schema_name = other_schema.class_name()
|
|
155
|
-
|
|
156
|
-
# construct new schema description
|
|
157
|
-
new_desc = (
|
|
158
|
-
f"The union of {schema_name} and {other_schema_name}\n\n"
|
|
159
|
-
f"{schema_name}:\n{cls.__doc__}\n\n"
|
|
160
|
-
f"{other_schema_name}:\n{other_schema.__doc__}"
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
# construct new lists of field names, types, and descriptions
|
|
164
|
-
# NOTE: we don't need to use unique field names because they will be injected with an ID at runtime
|
|
165
|
-
new_field_names, new_field_types, new_field_descs = [], [], []
|
|
166
|
-
this_field_map = cls.field_map()
|
|
167
|
-
for field_name, field in this_field_map.items():
|
|
168
|
-
new_field_names.append(field_name)
|
|
169
|
-
new_field_types.append(field)
|
|
170
|
-
new_field_descs.append(field._desc)
|
|
171
|
-
|
|
172
|
-
other_field_map = other_schema.field_map()
|
|
173
|
-
for field_name, field in other_field_map.items():
|
|
174
|
-
new_field_names.append(field_name)
|
|
175
|
-
new_field_types.append(field)
|
|
176
|
-
new_field_descs.append(field._desc)
|
|
177
|
-
|
|
178
|
-
# rename duplicate fields if we are keeping duplicates
|
|
179
|
-
if keep_duplicates:
|
|
180
|
-
dup_new_field_names = []
|
|
181
|
-
for left_idx, left_field_name in enumerate(new_field_names):
|
|
182
|
-
# see if there's a duplicate field name
|
|
183
|
-
matching_field = False
|
|
184
|
-
for right_idx in range(left_idx + 1, len(new_field_names)):
|
|
185
|
-
right_field_name = new_field_names[right_idx]
|
|
186
|
-
if left_field_name == right_field_name:
|
|
187
|
-
matching_field = True
|
|
188
|
-
break
|
|
189
|
-
|
|
190
|
-
# if theres a matching field, add them both with their schema names
|
|
191
|
-
if matching_field:
|
|
192
|
-
dup_new_field_names.append(schema_name + "_" + left_field_name)
|
|
193
|
-
dup_new_field_names.append(other_schema_name + "_" + left_field_name)
|
|
194
|
-
else:
|
|
195
|
-
dup_new_field_names.append(left_field_name)
|
|
196
|
-
|
|
197
|
-
# update new_field_names
|
|
198
|
-
new_field_names = dup_new_field_names
|
|
199
|
-
|
|
200
|
-
# Generate the schema class dynamically
|
|
201
|
-
attributes = {"_desc": new_desc, "__doc__": new_desc}
|
|
202
|
-
for field_name, field_type, field_desc in zip(new_field_names, new_field_types, new_field_descs):
|
|
203
|
-
attributes[field_name] = field_type.__class__(desc=field_desc)
|
|
204
|
-
|
|
205
|
-
# compute the name for the new schema
|
|
206
|
-
new_schema_name = f"Schema[{sorted(new_field_names)}]"
|
|
207
|
-
|
|
208
|
-
# Create the class dynamically
|
|
209
|
-
return type(new_schema_name, (Schema,), attributes)
|
|
210
|
-
|
|
211
|
-
@classmethod
|
|
212
|
-
def project(cls, project_cols: list[str]) -> Schema:
|
|
213
|
-
"""Return a projection of this schema with only the project_cols"""
|
|
214
|
-
# construct the new schema name
|
|
215
|
-
schema_name = cls.class_name()
|
|
216
|
-
|
|
217
|
-
# construct new schema description
|
|
218
|
-
new_desc = f"A projection of {schema_name} which only contains the fields {project_cols}"
|
|
219
|
-
|
|
220
|
-
# make sure projection column names are shortened
|
|
221
|
-
project_cols = [field_name.split(".")[-1] for field_name in project_cols]
|
|
222
|
-
|
|
223
|
-
# construct new lists of field names, types, and descriptions
|
|
224
|
-
# NOTE: we don't need to use unique field names because they will be injected with an ID at runtime
|
|
225
|
-
new_field_names, new_field_types, new_field_descs = [], [], []
|
|
226
|
-
for field_name, field in cls.field_map().items():
|
|
227
|
-
if field_name in project_cols:
|
|
228
|
-
new_field_names.append(field_name)
|
|
229
|
-
new_field_types.append(field)
|
|
230
|
-
new_field_descs.append(field._desc)
|
|
231
|
-
|
|
232
|
-
# Generate the schema class dynamically
|
|
233
|
-
attributes = {"_desc": new_desc, "__doc__": new_desc}
|
|
234
|
-
for field_name, field_type, field_desc in zip(new_field_names, new_field_types, new_field_descs):
|
|
235
|
-
attributes[field_name] = field_type.__class__(desc=field_desc)
|
|
236
|
-
|
|
237
|
-
# compute the name for the new schema
|
|
238
|
-
new_schema_name = f"Schema[{sorted(new_field_names)}]"
|
|
239
|
-
|
|
240
|
-
# Create the class dynamically
|
|
241
|
-
return type(new_schema_name, (Schema,), attributes)
|
|
242
|
-
|
|
243
|
-
@staticmethod
|
|
244
|
-
def from_df(df: pd.DataFrame) -> Schema:
|
|
245
|
-
# get new field names, types, and descriptions
|
|
246
|
-
new_field_names, new_field_types, new_field_descs = [], [], []
|
|
247
|
-
for column, dtype in zip(df.columns, df.dtypes):
|
|
248
|
-
column = f"column_{column}" if isinstance(column, int) else column
|
|
249
|
-
field_desc = f"The {column} column from an input DataFrame"
|
|
250
|
-
if dtype == "object":
|
|
251
|
-
new_field_types.append(StringField(desc=field_desc))
|
|
252
|
-
elif dtype == "bool":
|
|
253
|
-
new_field_types.append(BooleanField(desc=field_desc))
|
|
254
|
-
elif dtype == "int64":
|
|
255
|
-
new_field_types.append(IntField(desc=field_desc))
|
|
256
|
-
elif dtype == "float64":
|
|
257
|
-
new_field_types.append(FloatField(desc=field_desc))
|
|
258
|
-
else:
|
|
259
|
-
new_field_types.append(Field(desc=field_desc))
|
|
260
|
-
|
|
261
|
-
new_field_names.append(column)
|
|
262
|
-
new_field_descs.append(field_desc)
|
|
263
|
-
|
|
264
|
-
# Generate the schema class dynamically
|
|
265
|
-
desc = "Schema derived from DataFrame"
|
|
266
|
-
attributes = {"_desc": desc, "__doc__": desc, "__module__": Schema.__module__}
|
|
267
|
-
for field_name, field_type, field_desc in zip(new_field_names, new_field_types, new_field_descs):
|
|
268
|
-
attributes[field_name] = field_type.__class__(desc=field_desc)
|
|
269
|
-
|
|
270
|
-
# compute the name for the new schema
|
|
271
|
-
new_schema_name = f"Schema[{sorted(new_field_names)}]"
|
|
272
|
-
|
|
273
|
-
# create and return the schema
|
|
274
|
-
return type(new_schema_name, (Schema,), attributes)
|
|
275
|
-
|
|
276
|
-
@classmethod
|
|
277
|
-
def from_json(cls, fields: list[dict]) -> Schema:
|
|
278
|
-
return cls.add_fields(fields)
|
|
279
|
-
|
|
280
|
-
@classmethod
|
|
281
|
-
def add_fields(cls, fields: list[dict]) -> Schema:
|
|
282
|
-
"""Add fields to the schema
|
|
283
|
-
|
|
284
|
-
Args:
|
|
285
|
-
fields: List of dictionaries, each containing 'name', 'desc', and 'type' keys
|
|
286
|
-
|
|
287
|
-
Returns:
|
|
288
|
-
A new Schema with the additional fields
|
|
289
|
-
"""
|
|
290
|
-
assert isinstance(fields, list), "fields must be a list of dictionaries"
|
|
291
|
-
for field in fields:
|
|
292
|
-
assert "name" in field, "fields must contain a 'name' key"
|
|
293
|
-
assert "desc" in field, "fields must contain a 'desc' key"
|
|
294
|
-
assert "type" in field, "fields must contain a 'type' key"
|
|
295
|
-
|
|
296
|
-
# build up field names, descriptions, and types
|
|
297
|
-
new_field_names = [field["name"] for field in fields]
|
|
298
|
-
new_field_objs = [
|
|
299
|
-
construct_field_type(field["type"], desc=field["desc"])
|
|
300
|
-
for field in fields
|
|
301
|
-
]
|
|
302
|
-
|
|
303
|
-
# construct new schema
|
|
304
|
-
new_desc = f"Added fields to {cls.__name__}"
|
|
305
|
-
attributes = {"_desc": new_desc, "__doc__": new_desc}
|
|
306
|
-
for field_name, field_obj in zip(new_field_names, new_field_objs):
|
|
307
|
-
attributes[field_name] = field_obj
|
|
308
|
-
|
|
309
|
-
new_output_schema = type(f"{cls.__name__}Extended", (Schema,), attributes)
|
|
310
|
-
|
|
311
|
-
# return the union of this new schema with the cls
|
|
312
|
-
return cls.union(new_output_schema)
|
|
313
|
-
|
|
314
|
-
@classmethod
|
|
315
|
-
def class_name(cls) -> str:
|
|
316
|
-
"""Return the name of this class"""
|
|
317
|
-
return cls.__name__
|
|
318
|
-
|
|
10
|
+
from palimpzest.utils.hash_helpers import hash_for_serialized_dict
|
|
11
|
+
|
|
12
|
+
# DEFINITIONS
|
|
13
|
+
PANDAS_DTYPE_TO_PYDANTIC = {
|
|
14
|
+
"object": str,
|
|
15
|
+
"bool": bool,
|
|
16
|
+
"int64": int,
|
|
17
|
+
"float64": float,
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
# IMAGE TYPES
|
|
21
|
+
ImageFilepath = TypeAliasType('ImageFilepath', str)
|
|
22
|
+
ImageBase64 = TypeAliasType('ImageBase64', str)
|
|
23
|
+
ImageURL = TypeAliasType('ImageURL', str)
|
|
24
|
+
|
|
25
|
+
# AUDIO TYPES
|
|
26
|
+
AudioFilepath = TypeAliasType('AudioFilepath', str)
|
|
27
|
+
AudioBase64 = TypeAliasType('AudioBase64', str)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_schema_field_names(schema: type[BaseModel], id: str | None = None) -> list[str]:
|
|
31
|
+
"""Return the field names of a Pydantic model."""
|
|
32
|
+
return list(schema.model_fields) if id is None else [f"{schema.__name__}.{id}.{field_name}" for field_name in schema.model_fields]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _create_pickleable_model(fields: dict[str, tuple[type, FieldInfo]]) -> type[BaseModel]:
|
|
36
|
+
"""Create a Pydantic model that can be pickled."""
|
|
37
|
+
# create unique name for the unioned model
|
|
38
|
+
new_schema_name = f"Schema{sorted(fields.keys())}"
|
|
39
|
+
new_schema_id = hash_for_serialized_dict({
|
|
40
|
+
field_name: {"annotation": str(annotation), "default": str(field.default), "description": field.description}
|
|
41
|
+
for field_name, (annotation, field) in fields.items()
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
# if this class already exists, get it from the module and return
|
|
45
|
+
module = sys.modules[__name__]
|
|
46
|
+
if hasattr(module, new_schema_id):
|
|
47
|
+
return getattr(module, new_schema_id)
|
|
48
|
+
|
|
49
|
+
# create the class dynamically
|
|
50
|
+
new_model = create_model(new_schema_name, **fields)
|
|
51
|
+
|
|
52
|
+
# register it in the module's namespace so pickle can find it
|
|
53
|
+
module = sys.modules[__name__]
|
|
54
|
+
setattr(module, new_schema_id, new_model)
|
|
55
|
+
new_model.__module__ = module.__name__
|
|
56
|
+
|
|
57
|
+
return new_model
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def project(model: type[BaseModel], project_fields: list[str]) -> type[BaseModel]:
|
|
61
|
+
"""Project a Pydantic model to only the specified columns."""
|
|
62
|
+
# make sure projection column names are shortened
|
|
63
|
+
project_fields = [field_name.split(".")[-1] for field_name in project_fields]
|
|
64
|
+
|
|
65
|
+
# build up the fields for the new schema
|
|
66
|
+
fields = {}
|
|
67
|
+
for field_name, field in model.model_fields.items():
|
|
68
|
+
if field_name in project_fields:
|
|
69
|
+
fields[field_name] = (field.annotation, field)
|
|
70
|
+
|
|
71
|
+
# create and return the new schema
|
|
72
|
+
return _create_pickleable_model(fields)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def create_schema_from_fields(fields: list[dict]) -> type[BaseModel]:
|
|
76
|
+
"""Create a Pydantic model from a list of fields."""
|
|
77
|
+
fields_ = {}
|
|
78
|
+
for field in fields:
|
|
79
|
+
assert "name" in field, "fields must contain a 'name' key"
|
|
80
|
+
assert "type" in field, "fields must contain a 'type' key"
|
|
81
|
+
assert "desc" in field or "description" in field, "fields must contain a 'description' key"
|
|
82
|
+
|
|
83
|
+
# for backwards compatability, rename "desc" to "description"
|
|
84
|
+
if "desc" in field:
|
|
85
|
+
field["description"] = field.pop("desc")
|
|
86
|
+
field_name = field["name"]
|
|
87
|
+
field_type = field["type"]
|
|
88
|
+
fields_[field_name] = (field_type, Field(**{k: v for k, v in field.items() if k not in ["name", "type"]}))
|
|
89
|
+
|
|
90
|
+
return _create_pickleable_model(fields_)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
|
|
94
|
+
"""Create a Pydantic model from a Pandas DataFrame."""
|
|
95
|
+
fields = {}
|
|
96
|
+
for column, dtype in zip(df.columns, df.dtypes):
|
|
97
|
+
column = f"column_{column}" if isinstance(column, int) else column
|
|
98
|
+
field_desc = f"The {column} column from an input DataFrame"
|
|
99
|
+
annotation = PANDAS_DTYPE_TO_PYDANTIC.get(str(dtype), Any)
|
|
100
|
+
fields[column] = (annotation, Field(description=field_desc))
|
|
101
|
+
|
|
102
|
+
# create and return the new schema
|
|
103
|
+
return _create_pickleable_model(fields)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
|
|
107
|
+
"""Union multiple Pydantic models into a single model."""
|
|
108
|
+
fields = {}
|
|
109
|
+
for model in models:
|
|
110
|
+
for field_name, field in model.model_fields.items():
|
|
111
|
+
if field_name in fields and not join:
|
|
112
|
+
assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
|
|
113
|
+
elif field_name in fields and join:
|
|
114
|
+
while field_name in fields:
|
|
115
|
+
field_name = f"{field_name}_right"
|
|
116
|
+
fields[field_name] = (field.annotation, field)
|
|
117
|
+
|
|
118
|
+
# create and return the new schema
|
|
119
|
+
return _create_pickleable_model(fields)
|
|
319
120
|
|
|
320
121
|
###################################################################################
|
|
321
122
|
# "Core" useful Schemas. These are Schemas that almost everyone will need.
|
|
@@ -324,120 +125,81 @@ class Schema(metaclass=SchemaMetaclass):
|
|
|
324
125
|
|
|
325
126
|
|
|
326
127
|
# First-level Schema's
|
|
327
|
-
class DefaultSchema(
|
|
128
|
+
class DefaultSchema(BaseModel):
|
|
328
129
|
"""Store context data."""
|
|
130
|
+
value: Any = Field(description="The value of the input data")
|
|
329
131
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
class Download(Schema):
|
|
132
|
+
class Download(BaseModel):
|
|
334
133
|
"""A download is a URL and the contents of the download."""
|
|
134
|
+
url: str = Field(description="The URL of the download")
|
|
135
|
+
content: bytes = Field(description="The contents of the download")
|
|
136
|
+
timestamp: str = Field(description="The timestamp of the download")
|
|
335
137
|
|
|
336
|
-
|
|
337
|
-
content = BytesField(desc="The contents of the download")
|
|
338
|
-
timestamp = StringField(desc="The timestamp of the download")
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
class File(Schema):
|
|
138
|
+
class File(BaseModel):
|
|
342
139
|
"""
|
|
343
140
|
A File is defined by two Fields:
|
|
344
141
|
- the filename (string)
|
|
345
142
|
- the contents of the file (bytes)
|
|
346
143
|
"""
|
|
144
|
+
filename: str = Field(description="The UNIX-style name of the file")
|
|
145
|
+
contents: bytes = Field(description="The contents of the file")
|
|
347
146
|
|
|
348
|
-
|
|
349
|
-
contents = BytesField(desc="The contents of the file")
|
|
350
|
-
|
|
351
|
-
class TextFile(Schema):
|
|
147
|
+
class TextFile(BaseModel):
|
|
352
148
|
"""A text file is a File that contains only text. No binary data."""
|
|
353
|
-
filename =
|
|
354
|
-
contents =
|
|
149
|
+
filename: str = Field(description="The UNIX-style name of the file")
|
|
150
|
+
contents: str = Field(description="The contents of the file")
|
|
355
151
|
|
|
356
|
-
class
|
|
357
|
-
"
|
|
152
|
+
class Average(BaseModel):
|
|
153
|
+
average: float = Field(description="The average value of items in the dataset")
|
|
358
154
|
|
|
359
|
-
|
|
155
|
+
class Count(BaseModel):
|
|
156
|
+
count: int = Field(description="The count of items in the dataset")
|
|
360
157
|
|
|
361
|
-
|
|
362
|
-
class OperatorDerivedSchema(Schema):
|
|
158
|
+
class OperatorDerivedSchema(BaseModel):
|
|
363
159
|
"""Schema defined by an operator, e.g., a join or a group by"""
|
|
364
160
|
|
|
365
|
-
|
|
366
|
-
class RawJSONObject(Schema):
|
|
367
|
-
"""A JSON object, which is a dictionary of key-value pairs."""
|
|
368
|
-
|
|
369
|
-
json = StringField(desc="String representation of a JSON object")
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
list_of_strings = ListField(StringField)
|
|
373
|
-
list_of_lists = ListField(ListField)
|
|
374
|
-
class Table(Schema):
|
|
161
|
+
class Table(BaseModel):
|
|
375
162
|
"""A Table is an object composed of a header and rows."""
|
|
163
|
+
filename: str = Field(description="The name of the file the table was extracted from")
|
|
164
|
+
name: str = Field(description="The name of the table")
|
|
165
|
+
header: list[str] = Field(description="The header of the table")
|
|
166
|
+
rows: list[list] = Field(description="The rows of the table")
|
|
376
167
|
|
|
377
|
-
|
|
378
|
-
name = StringField(desc="The name of the table")
|
|
379
|
-
header = list_of_strings(desc="The header of the table")
|
|
380
|
-
# TODO currently no support for nesting data records on data records
|
|
381
|
-
rows = list_of_lists(desc="The rows of the table")
|
|
382
|
-
|
|
383
|
-
def field_to_json(self, field_name: str, field_value: TypingAny) -> TypingAny:
|
|
384
|
-
"""Return a truncated JSON representation for `rows` and a string representation for `header`"""
|
|
385
|
-
# take the first MAX_ROWS rows in the record_dict and turn them into comma separated strings
|
|
386
|
-
if field_name == "rows":
|
|
387
|
-
return [",".join(map(str, row)) + "\n" for row in field_value[:MAX_ROWS]]
|
|
388
|
-
|
|
389
|
-
elif field_name == "header":
|
|
390
|
-
return ",".join(field_value)
|
|
391
|
-
|
|
392
|
-
return field_value
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
class URL(Schema):
|
|
168
|
+
class URL(BaseModel):
|
|
396
169
|
"""A URL is a string that represents a web address."""
|
|
170
|
+
url: str = Field(description="A URL")
|
|
397
171
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
class WebPage(Schema):
|
|
172
|
+
class WebPage(BaseModel):
|
|
402
173
|
"""A web page is a URL and the contents of the page."""
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
timestamp = StringField(desc="The timestamp of the download")
|
|
408
|
-
filename = StringField(desc="The name of the file the web page was downloaded from")
|
|
409
|
-
|
|
174
|
+
text: str = Field(description="The text contents of the web page")
|
|
175
|
+
html: str = Field(description="The html contents of the web page")
|
|
176
|
+
timestamp: str = Field(description="The timestamp of the download")
|
|
177
|
+
filename: str = Field(description="The name of the file the web page was downloaded from")
|
|
410
178
|
|
|
411
179
|
# Second-level Schemas
|
|
412
180
|
class ImageFile(File):
|
|
413
181
|
"""A file that contains an image."""
|
|
182
|
+
contents: ImageBase64 = Field(description="The contents of the image encoded as a base64 string")
|
|
414
183
|
|
|
415
|
-
|
|
416
|
-
|
|
184
|
+
class AudioFile(File):
|
|
185
|
+
"""A file that contains audio."""
|
|
186
|
+
contents: AudioBase64 = Field(description="The contents of an audio recording encoded as a base64 string")
|
|
417
187
|
|
|
418
188
|
class PDFFile(File):
|
|
419
189
|
"""A PDF file is a File that is a PDF. It has specialized fields, font information, etc."""
|
|
420
|
-
|
|
421
190
|
# This class is currently very impoverished. It needs a lot more fields before it can correctly represent a PDF.
|
|
422
|
-
text_contents =
|
|
191
|
+
text_contents: str = Field(description="The text-only contents of the PDF")
|
|
423
192
|
|
|
424
|
-
|
|
425
|
-
list_of_numbers = ListField(NumericField)
|
|
426
193
|
class XLSFile(File):
|
|
427
194
|
"""An XLS file is a File that contains one or more Excel spreadsheets."""
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
sheet_names = list_of_numbers(desc="The names of the sheets in the Excel file")
|
|
431
|
-
|
|
195
|
+
number_sheets: int = Field(description="The number of sheets in the Excel file")
|
|
196
|
+
sheet_names: list[str] = Field(description="The names of the sheets in the Excel file")
|
|
432
197
|
|
|
433
198
|
# Third-level Schemas
|
|
434
199
|
class EquationImage(ImageFile):
|
|
435
200
|
"""An image that contains a mathematical equation."""
|
|
436
|
-
|
|
437
|
-
equation_text = StringField(desc="The text representation of the equation in the image")
|
|
438
|
-
|
|
201
|
+
equation_text: str = Field(description="The text representation of the equation in the image")
|
|
439
202
|
|
|
440
203
|
class PlotImage(ImageFile):
|
|
441
204
|
"""An image that contains a plot, such as a graph or chart."""
|
|
442
|
-
|
|
443
|
-
plot_description = StringField(desc="A description of the plot")
|
|
205
|
+
plot_description: str = Field(description="A description of the plot")
|