palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.20.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,321 +1,122 @@
1
1
  from __future__ import annotations
2
2
 
3
- import json
4
- from typing import Any as TypingAny
3
+ import sys
4
+ from typing import Any, TypeAliasType
5
5
 
6
6
  import pandas as pd
7
+ from pydantic import BaseModel, Field, create_model
8
+ from pydantic.fields import FieldInfo
7
9
 
8
- from palimpzest.constants import MAX_ROWS
9
- from palimpzest.core.lib.fields import (
10
- BooleanField,
11
- BytesField,
12
- Field,
13
- FloatField,
14
- ImageBase64Field,
15
- IntField,
16
- ListField,
17
- NumericField,
18
- StringField,
19
- )
20
- from palimpzest.utils.field_helpers import construct_field_type
21
-
22
-
23
- class SchemaMetaclass(type):
24
- """
25
- This is a metaclass for our Schema class.
26
- """
27
-
28
- def __eq__(cls, other) -> bool:
29
- """
30
- Equality function for the Schema which checks that the ordered fields and class names are the same.
31
- """
32
- return cls.get_desc() == other.get_desc()
33
-
34
-
35
- class Schema(metaclass=SchemaMetaclass):
36
- """
37
- A Schema is defined by a set of named Fields. Much of the class is implemented in the SchemaMetaclass.
38
- Because Schema is a MetaClass, its fields are defined similar to how they are defined in a Python dataclass.
39
-
40
- For example, if you wanted to define a schema for research papers, you could define a schema
41
- with fields representing the paper's title, publication year, and publishing journal:
42
-
43
- ```python
44
- class ResearchPaper(Schema):
45
- paper_title = Field(desc="The title of a scientific paper")
46
- paper_year = Field(desc="The year the paper was published")
47
- paper_journal = Field(desc="The name of the journal that published the paper")
48
- ```
49
- """
50
-
51
- def __init__(self, desc: str | None = None):
52
- self._desc = "" if desc is None else desc
53
-
54
- def __str__(self) -> str:
55
- return f"{self.__class__.__name__}(desc={self._desc})"
56
-
57
- # TODO: after eliminating the metaclass, this does not work
58
- def __eq__(self, other) -> bool:
59
- """
60
- Equality function for the Schema which checks that the ordered fields and class names are the same.
61
- """
62
- schema = self.get_desc()
63
- other_schema = other.get_desc()
64
-
65
- return schema == other_schema
66
-
67
- def __hash__(self) -> int:
68
- """Hash function for the Schema which is a simple hash of its ordered Fields and class name."""
69
- ordered = self.get_desc()
70
-
71
- return hash(ordered.encode())
72
-
73
- @classmethod
74
- def get_desc(cls) -> str:
75
- """Return a description of the schema"""
76
- fields = cls.field_names()
77
- d = {k: hash(getattr(cls, k)) for k in fields}
78
- d["__class__"] = cls.class_name()
79
-
80
- return json.dumps(d, sort_keys=True)
81
-
82
- @classmethod
83
- def field_names(cls, unique=False, id="") -> list[str]:
84
- """
85
- Return a list of the fields in this Schema. The `unique` argument is used to determine if the
86
- class name should be prefixed to the field name for unique identification. The `id` argument is
87
- used to provide a unique identifier for the class name.
88
- """
89
- attributes = dir(cls)
90
- attributes = [attr for attr in attributes if not attr.startswith("__")]
91
- prefix = f"{cls.__name__}.{id}." if unique else ""
92
- fields = [prefix + attr for attr in attributes if isinstance(getattr(cls, attr), Field)]
93
- return fields
94
-
95
- @classmethod
96
- def field_desc_map(cls, unique=False, id="") -> dict[str, str]:
97
- """
98
- Return a mapping from field names to their descriptions. The `unique` argument is used to determine if the
99
- class name should be prefixed to the field name for unique identification. The `id` argument is
100
- used to provide a unique identifier for the class name.
101
- """
102
- attributes = dir(cls)
103
- attributes = [attr for attr in attributes if not attr.startswith("__")]
104
- prefix = f"{cls.__name__}.{id}." if unique else ""
105
- field_desc_map = {
106
- prefix + attr: getattr(cls, attr)._desc for attr in attributes if isinstance(getattr(cls, attr), Field)
107
- }
108
- return field_desc_map
109
-
110
- @classmethod
111
- def field_map(cls, unique=False, id="") -> dict[str, Field]:
112
- """
113
- Return a mapping from field names to their field types. The `unique` argument is used to determine if the
114
- class name should be prefixed to the field name for unique identification. The `id` argument is used to
115
- provide a unique identifier for the class name.
116
- """
117
- attributes = dir(cls)
118
- attributes = [attr for attr in attributes if not attr.startswith("__")]
119
- prefix = f"{cls.__name__}.{id}." if unique else ""
120
- field_map = {prefix + attr: getattr(cls, attr) for attr in attributes if isinstance(getattr(cls, attr), Field)}
121
- return field_map
122
-
123
- @classmethod
124
- def json_schema(cls) -> dict[str, TypingAny]:
125
- """The JSON representation of the Schema"""
126
- fields = cls.field_names()
127
-
128
- schema = {
129
- "fields": {},
130
- "type": "object",
131
- "description": cls.__doc__,
132
- }
133
- for k in fields:
134
- if k.startswith("_"):
135
- continue
136
- v = getattr(cls, k)
137
- if v is None:
138
- continue
139
-
140
- schema["fields"][k] = v.json_schema()
141
-
142
- return schema
143
-
144
- @staticmethod
145
- def field_to_json(field_name: str, field_value: TypingAny) -> TypingAny:
146
- """Return a representation of the specified field which will be used in its conversion to JSON"""
147
- return field_value
148
-
149
- @classmethod
150
- def union(cls, other_schema: Schema, keep_duplicates: bool = False) -> Schema:
151
- """Return the union of this schema with the other_schema"""
152
- # construct the new schema name
153
- schema_name = cls.class_name()
154
- other_schema_name = other_schema.class_name()
155
-
156
- # construct new schema description
157
- new_desc = (
158
- f"The union of {schema_name} and {other_schema_name}\n\n"
159
- f"{schema_name}:\n{cls.__doc__}\n\n"
160
- f"{other_schema_name}:\n{other_schema.__doc__}"
161
- )
162
-
163
- # construct new lists of field names, types, and descriptions
164
- # NOTE: we don't need to use unique field names because they will be injected with an ID at runtime
165
- new_field_names, new_field_types, new_field_descs = [], [], []
166
- this_field_map = cls.field_map()
167
- for field_name, field in this_field_map.items():
168
- new_field_names.append(field_name)
169
- new_field_types.append(field)
170
- new_field_descs.append(field._desc)
171
-
172
- other_field_map = other_schema.field_map()
173
- for field_name, field in other_field_map.items():
174
- new_field_names.append(field_name)
175
- new_field_types.append(field)
176
- new_field_descs.append(field._desc)
177
-
178
- # rename duplicate fields if we are keeping duplicates
179
- if keep_duplicates:
180
- dup_new_field_names = []
181
- for left_idx, left_field_name in enumerate(new_field_names):
182
- # see if there's a duplicate field name
183
- matching_field = False
184
- for right_idx in range(left_idx + 1, len(new_field_names)):
185
- right_field_name = new_field_names[right_idx]
186
- if left_field_name == right_field_name:
187
- matching_field = True
188
- break
189
-
190
- # if theres a matching field, add them both with their schema names
191
- if matching_field:
192
- dup_new_field_names.append(schema_name + "_" + left_field_name)
193
- dup_new_field_names.append(other_schema_name + "_" + left_field_name)
194
- else:
195
- dup_new_field_names.append(left_field_name)
196
-
197
- # update new_field_names
198
- new_field_names = dup_new_field_names
199
-
200
- # Generate the schema class dynamically
201
- attributes = {"_desc": new_desc, "__doc__": new_desc}
202
- for field_name, field_type, field_desc in zip(new_field_names, new_field_types, new_field_descs):
203
- attributes[field_name] = field_type.__class__(desc=field_desc)
204
-
205
- # compute the name for the new schema
206
- new_schema_name = f"Schema[{sorted(new_field_names)}]"
207
-
208
- # Create the class dynamically
209
- return type(new_schema_name, (Schema,), attributes)
210
-
211
- @classmethod
212
- def project(cls, project_cols: list[str]) -> Schema:
213
- """Return a projection of this schema with only the project_cols"""
214
- # construct the new schema name
215
- schema_name = cls.class_name()
216
-
217
- # construct new schema description
218
- new_desc = f"A projection of {schema_name} which only contains the fields {project_cols}"
219
-
220
- # make sure projection column names are shortened
221
- project_cols = [field_name.split(".")[-1] for field_name in project_cols]
222
-
223
- # construct new lists of field names, types, and descriptions
224
- # NOTE: we don't need to use unique field names because they will be injected with an ID at runtime
225
- new_field_names, new_field_types, new_field_descs = [], [], []
226
- for field_name, field in cls.field_map().items():
227
- if field_name in project_cols:
228
- new_field_names.append(field_name)
229
- new_field_types.append(field)
230
- new_field_descs.append(field._desc)
231
-
232
- # Generate the schema class dynamically
233
- attributes = {"_desc": new_desc, "__doc__": new_desc}
234
- for field_name, field_type, field_desc in zip(new_field_names, new_field_types, new_field_descs):
235
- attributes[field_name] = field_type.__class__(desc=field_desc)
236
-
237
- # compute the name for the new schema
238
- new_schema_name = f"Schema[{sorted(new_field_names)}]"
239
-
240
- # Create the class dynamically
241
- return type(new_schema_name, (Schema,), attributes)
242
-
243
- @staticmethod
244
- def from_df(df: pd.DataFrame) -> Schema:
245
- # get new field names, types, and descriptions
246
- new_field_names, new_field_types, new_field_descs = [], [], []
247
- for column, dtype in zip(df.columns, df.dtypes):
248
- column = f"column_{column}" if isinstance(column, int) else column
249
- field_desc = f"The {column} column from an input DataFrame"
250
- if dtype == "object":
251
- new_field_types.append(StringField(desc=field_desc))
252
- elif dtype == "bool":
253
- new_field_types.append(BooleanField(desc=field_desc))
254
- elif dtype == "int64":
255
- new_field_types.append(IntField(desc=field_desc))
256
- elif dtype == "float64":
257
- new_field_types.append(FloatField(desc=field_desc))
258
- else:
259
- new_field_types.append(Field(desc=field_desc))
260
-
261
- new_field_names.append(column)
262
- new_field_descs.append(field_desc)
263
-
264
- # Generate the schema class dynamically
265
- desc = "Schema derived from DataFrame"
266
- attributes = {"_desc": desc, "__doc__": desc, "__module__": Schema.__module__}
267
- for field_name, field_type, field_desc in zip(new_field_names, new_field_types, new_field_descs):
268
- attributes[field_name] = field_type.__class__(desc=field_desc)
269
-
270
- # compute the name for the new schema
271
- new_schema_name = f"Schema[{sorted(new_field_names)}]"
272
-
273
- # create and return the schema
274
- return type(new_schema_name, (Schema,), attributes)
275
-
276
- @classmethod
277
- def from_json(cls, fields: list[dict]) -> Schema:
278
- return cls.add_fields(fields)
279
-
280
- @classmethod
281
- def add_fields(cls, fields: list[dict]) -> Schema:
282
- """Add fields to the schema
283
-
284
- Args:
285
- fields: List of dictionaries, each containing 'name', 'desc', and 'type' keys
286
-
287
- Returns:
288
- A new Schema with the additional fields
289
- """
290
- assert isinstance(fields, list), "fields must be a list of dictionaries"
291
- for field in fields:
292
- assert "name" in field, "fields must contain a 'name' key"
293
- assert "desc" in field, "fields must contain a 'desc' key"
294
- assert "type" in field, "fields must contain a 'type' key"
295
-
296
- # build up field names, descriptions, and types
297
- new_field_names = [field["name"] for field in fields]
298
- new_field_objs = [
299
- construct_field_type(field["type"], desc=field["desc"])
300
- for field in fields
301
- ]
302
-
303
- # construct new schema
304
- new_desc = f"Added fields to {cls.__name__}"
305
- attributes = {"_desc": new_desc, "__doc__": new_desc}
306
- for field_name, field_obj in zip(new_field_names, new_field_objs):
307
- attributes[field_name] = field_obj
308
-
309
- new_output_schema = type(f"{cls.__name__}Extended", (Schema,), attributes)
310
-
311
- # return the union of this new schema with the cls
312
- return cls.union(new_output_schema)
313
-
314
- @classmethod
315
- def class_name(cls) -> str:
316
- """Return the name of this class"""
317
- return cls.__name__
318
-
10
+ from palimpzest.utils.hash_helpers import hash_for_serialized_dict
11
+
12
+ # DEFINITIONS
13
+ PANDAS_DTYPE_TO_PYDANTIC = {
14
+ "object": str,
15
+ "bool": bool,
16
+ "int64": int,
17
+ "float64": float,
18
+ }
19
+
20
+ # IMAGE TYPES
21
+ ImageFilepath = TypeAliasType('ImageFilepath', str)
22
+ ImageBase64 = TypeAliasType('ImageBase64', str)
23
+ ImageURL = TypeAliasType('ImageURL', str)
24
+
25
+ # AUDIO TYPES
26
+ AudioFilepath = TypeAliasType('AudioFilepath', str)
27
+ AudioBase64 = TypeAliasType('AudioBase64', str)
28
+
29
+
30
+ def get_schema_field_names(schema: type[BaseModel], id: str | None = None) -> list[str]:
31
+ """Return the field names of a Pydantic model."""
32
+ return list(schema.model_fields) if id is None else [f"{schema.__name__}.{id}.{field_name}" for field_name in schema.model_fields]
33
+
34
+
35
+ def _create_pickleable_model(fields: dict[str, tuple[type, FieldInfo]]) -> type[BaseModel]:
36
+ """Create a Pydantic model that can be pickled."""
37
+ # create unique name for the unioned model
38
+ new_schema_name = f"Schema{sorted(fields.keys())}"
39
+ new_schema_id = hash_for_serialized_dict({
40
+ field_name: {"annotation": str(annotation), "default": str(field.default), "description": field.description}
41
+ for field_name, (annotation, field) in fields.items()
42
+ })
43
+
44
+ # if this class already exists, get it from the module and return
45
+ module = sys.modules[__name__]
46
+ if hasattr(module, new_schema_id):
47
+ return getattr(module, new_schema_id)
48
+
49
+ # create the class dynamically
50
+ new_model = create_model(new_schema_name, **fields)
51
+
52
+ # register it in the module's namespace so pickle can find it
53
+ module = sys.modules[__name__]
54
+ setattr(module, new_schema_id, new_model)
55
+ new_model.__module__ = module.__name__
56
+
57
+ return new_model
58
+
59
+
60
+ def project(model: type[BaseModel], project_fields: list[str]) -> type[BaseModel]:
61
+ """Project a Pydantic model to only the specified columns."""
62
+ # make sure projection column names are shortened
63
+ project_fields = [field_name.split(".")[-1] for field_name in project_fields]
64
+
65
+ # build up the fields for the new schema
66
+ fields = {}
67
+ for field_name, field in model.model_fields.items():
68
+ if field_name in project_fields:
69
+ fields[field_name] = (field.annotation, field)
70
+
71
+ # create and return the new schema
72
+ return _create_pickleable_model(fields)
73
+
74
+
75
+ def create_schema_from_fields(fields: list[dict]) -> type[BaseModel]:
76
+ """Create a Pydantic model from a list of fields."""
77
+ fields_ = {}
78
+ for field in fields:
79
+ assert "name" in field, "fields must contain a 'name' key"
80
+ assert "type" in field, "fields must contain a 'type' key"
81
+ assert "desc" in field or "description" in field, "fields must contain a 'description' key"
82
+
83
+ # for backwards compatability, rename "desc" to "description"
84
+ if "desc" in field:
85
+ field["description"] = field.pop("desc")
86
+ field_name = field["name"]
87
+ field_type = field["type"]
88
+ fields_[field_name] = (field_type, Field(**{k: v for k, v in field.items() if k not in ["name", "type"]}))
89
+
90
+ return _create_pickleable_model(fields_)
91
+
92
+
93
+ def create_schema_from_df(df: pd.DataFrame) -> type[BaseModel]:
94
+ """Create a Pydantic model from a Pandas DataFrame."""
95
+ fields = {}
96
+ for column, dtype in zip(df.columns, df.dtypes):
97
+ column = f"column_{column}" if isinstance(column, int) else column
98
+ field_desc = f"The {column} column from an input DataFrame"
99
+ annotation = PANDAS_DTYPE_TO_PYDANTIC.get(str(dtype), Any)
100
+ fields[column] = (annotation, Field(description=field_desc))
101
+
102
+ # create and return the new schema
103
+ return _create_pickleable_model(fields)
104
+
105
+
106
+ def union_schemas(models: list[type[BaseModel]], join: bool = False) -> type[BaseModel]:
107
+ """Union multiple Pydantic models into a single model."""
108
+ fields = {}
109
+ for model in models:
110
+ for field_name, field in model.model_fields.items():
111
+ if field_name in fields and not join:
112
+ assert fields[field_name][0] == field.annotation, f"Field {field_name} has different types in different models"
113
+ elif field_name in fields and join:
114
+ while field_name in fields:
115
+ field_name = f"{field_name}_right"
116
+ fields[field_name] = (field.annotation, field)
117
+
118
+ # create and return the new schema
119
+ return _create_pickleable_model(fields)
319
120
 
320
121
  ###################################################################################
321
122
  # "Core" useful Schemas. These are Schemas that almost everyone will need.
@@ -324,120 +125,81 @@ class Schema(metaclass=SchemaMetaclass):
324
125
 
325
126
 
326
127
  # First-level Schema's
327
- class DefaultSchema(Schema):
128
+ class DefaultSchema(BaseModel):
328
129
  """Store context data."""
130
+ value: Any = Field(description="The value of the input data")
329
131
 
330
- value = Field(desc="The value of the input data")
331
-
332
-
333
- class Download(Schema):
132
+ class Download(BaseModel):
334
133
  """A download is a URL and the contents of the download."""
134
+ url: str = Field(description="The URL of the download")
135
+ content: bytes = Field(description="The contents of the download")
136
+ timestamp: str = Field(description="The timestamp of the download")
335
137
 
336
- url = StringField(desc="The URL of the download")
337
- content = BytesField(desc="The contents of the download")
338
- timestamp = StringField(desc="The timestamp of the download")
339
-
340
-
341
- class File(Schema):
138
+ class File(BaseModel):
342
139
  """
343
140
  A File is defined by two Fields:
344
141
  - the filename (string)
345
142
  - the contents of the file (bytes)
346
143
  """
144
+ filename: str = Field(description="The UNIX-style name of the file")
145
+ contents: bytes = Field(description="The contents of the file")
347
146
 
348
- filename = StringField(desc="The UNIX-style name of the file")
349
- contents = BytesField(desc="The contents of the file")
350
-
351
- class TextFile(Schema):
147
+ class TextFile(BaseModel):
352
148
  """A text file is a File that contains only text. No binary data."""
353
- filename = StringField(desc="The UNIX-style name of the file")
354
- contents = StringField(desc="The contents of the file")
149
+ filename: str = Field(description="The UNIX-style name of the file")
150
+ contents: str = Field(description="The contents of the file")
355
151
 
356
- class Number(Schema):
357
- """Just a number. Often used for aggregates"""
152
+ class Average(BaseModel):
153
+ average: float = Field(description="The average value of items in the dataset")
358
154
 
359
- value = NumericField(desc="The value of a number")
155
+ class Count(BaseModel):
156
+ count: int = Field(description="The count of items in the dataset")
360
157
 
361
-
362
- class OperatorDerivedSchema(Schema):
158
+ class OperatorDerivedSchema(BaseModel):
363
159
  """Schema defined by an operator, e.g., a join or a group by"""
364
160
 
365
-
366
- class RawJSONObject(Schema):
367
- """A JSON object, which is a dictionary of key-value pairs."""
368
-
369
- json = StringField(desc="String representation of a JSON object")
370
-
371
-
372
- list_of_strings = ListField(StringField)
373
- list_of_lists = ListField(ListField)
374
- class Table(Schema):
161
+ class Table(BaseModel):
375
162
  """A Table is an object composed of a header and rows."""
163
+ filename: str = Field(description="The name of the file the table was extracted from")
164
+ name: str = Field(description="The name of the table")
165
+ header: list[str] = Field(description="The header of the table")
166
+ rows: list[list] = Field(description="The rows of the table")
376
167
 
377
- filename = StringField(desc="The name of the file the table was extracted from")
378
- name = StringField(desc="The name of the table")
379
- header = list_of_strings(desc="The header of the table")
380
- # TODO currently no support for nesting data records on data records
381
- rows = list_of_lists(desc="The rows of the table")
382
-
383
- def field_to_json(self, field_name: str, field_value: TypingAny) -> TypingAny:
384
- """Return a truncated JSON representation for `rows` and a string representation for `header`"""
385
- # take the first MAX_ROWS rows in the record_dict and turn them into comma separated strings
386
- if field_name == "rows":
387
- return [",".join(map(str, row)) + "\n" for row in field_value[:MAX_ROWS]]
388
-
389
- elif field_name == "header":
390
- return ",".join(field_value)
391
-
392
- return field_value
393
-
394
-
395
- class URL(Schema):
168
+ class URL(BaseModel):
396
169
  """A URL is a string that represents a web address."""
170
+ url: str = Field(description="A URL")
397
171
 
398
- url = StringField(desc="A URL")
399
-
400
-
401
- class WebPage(Schema):
172
+ class WebPage(BaseModel):
402
173
  """A web page is a URL and the contents of the page."""
403
-
404
- # url = StringField(desc="The URL of the web page")
405
- text = StringField(desc="The text contents of the web page")
406
- html = StringField(desc="The html contents of the web page")
407
- timestamp = StringField(desc="The timestamp of the download")
408
- filename = StringField(desc="The name of the file the web page was downloaded from")
409
-
174
+ text: str = Field(description="The text contents of the web page")
175
+ html: str = Field(description="The html contents of the web page")
176
+ timestamp: str = Field(description="The timestamp of the download")
177
+ filename: str = Field(description="The name of the file the web page was downloaded from")
410
178
 
411
179
  # Second-level Schemas
412
180
  class ImageFile(File):
413
181
  """A file that contains an image."""
182
+ contents: ImageBase64 = Field(description="The contents of the image encoded as a base64 string")
414
183
 
415
- contents = ImageBase64Field(desc="The contents of the image")
416
-
184
+ class AudioFile(File):
185
+ """A file that contains audio."""
186
+ contents: AudioBase64 = Field(description="The contents of an audio recording encoded as a base64 string")
417
187
 
418
188
  class PDFFile(File):
419
189
  """A PDF file is a File that is a PDF. It has specialized fields, font information, etc."""
420
-
421
190
  # This class is currently very impoverished. It needs a lot more fields before it can correctly represent a PDF.
422
- text_contents = StringField(desc="The text-only contents of the PDF")
191
+ text_contents: str = Field(description="The text-only contents of the PDF")
423
192
 
424
-
425
- list_of_numbers = ListField(NumericField)
426
193
  class XLSFile(File):
427
194
  """An XLS file is a File that contains one or more Excel spreadsheets."""
428
-
429
- number_sheets = NumericField(desc="The number of sheets in the Excel file")
430
- sheet_names = list_of_numbers(desc="The names of the sheets in the Excel file")
431
-
195
+ number_sheets: int = Field(description="The number of sheets in the Excel file")
196
+ sheet_names: list[str] = Field(description="The names of the sheets in the Excel file")
432
197
 
433
198
  # Third-level Schemas
434
199
  class EquationImage(ImageFile):
435
200
  """An image that contains a mathematical equation."""
436
-
437
- equation_text = StringField(desc="The text representation of the equation in the image")
438
-
201
+ equation_text: str = Field(description="The text representation of the equation in the image")
439
202
 
440
203
  class PlotImage(ImageFile):
441
204
  """An image that contains a plot, such as a graph or chart."""
442
-
443
- plot_description = StringField(desc="A description of the plot")
205
+ plot_description: str = Field(description="A description of the plot")