palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
@@ -4,69 +4,62 @@ import base64
4
4
  import os
5
5
  from abc import ABC, abstractmethod
6
6
  from io import BytesIO
7
+ from pathlib import Path
7
8
 
8
9
  import pandas as pd
9
10
  from bs4 import BeautifulSoup
11
+ from pydantic import BaseModel
10
12
 
11
13
  from palimpzest import constants
14
+ from palimpzest.core.data import dataset
12
15
  from palimpzest.core.lib.schemas import (
16
+ AudioFile,
13
17
  DefaultSchema,
14
- File,
15
18
  ImageFile,
16
19
  PDFFile,
17
- Schema,
18
20
  TextFile,
19
21
  WebPage,
20
22
  XLSFile,
23
+ create_schema_from_df,
24
+ create_schema_from_fields,
21
25
  )
26
+ from palimpzest.query.operators.logical import BaseScan
22
27
  from palimpzest.tools.pdfparser import get_text_from_pdf
23
28
 
24
29
 
25
- # First level of abstraction
26
- class DataReader(ABC):
30
+ ####################
31
+ ### BASE CLASSES ###
32
+ ####################
33
+ class IterDataset(dataset.Dataset, ABC):
27
34
  """
28
- The `DataReader` is a base class for which may be used to generate data that
29
- is processed by PZ.
35
+ The `IterDataset` is an abstract base class for root `Datasets` whose data is accessed
36
+ via iteration. Classes which inherit from this class must implement two methods:
30
37
 
31
- Subclasses of the (abstract) `DataReader` class must implement two methods:
32
-
33
- - `__len__()`: which returns the number of elements in the data source
38
+ - `__len__()`: which returns the number of elements in the dataset
34
39
  - `__getitem__(idx: int)`: which takes in an `idx` and returns the element at that index
35
40
  """
36
41
 
37
- def __init__(self, schema: type[Schema] | list[dict]) -> None:
42
+ def __init__(self, id: str, schema: type[BaseModel] | list[dict]) -> None:
38
43
  """
39
- Constructor for the `DataReader` class.
44
+ Constructor for the `IterDataset` class.
40
45
 
41
46
  Args:
42
- schema (Schema | list[dict]): The output schema of the records returned by the DataReader
47
+ id (str): a string identifier for the `Dataset`
48
+ schema (BaseModel | list[dict]): The output schema of the records returned by the `Dataset`
43
49
  """
44
- # NOTE: _schema attribute currently has to match attribute name in Dataset
45
- self._schema = Schema.from_json(schema) if isinstance(schema, list) else schema
46
-
47
- def __eq__(self, __value: object) -> bool:
48
- return self.__dict__ == __value.__dict__
49
-
50
- def __str__(self) -> str:
51
- return f"{self.__class__.__name__}(schema={self.schema})"
52
-
53
- @property
54
- def schema(self) -> Schema:
55
- return self._schema
56
-
57
- # NOTE: currently used by optimizer to compute node id for DataReaders
58
- def serialize(self) -> dict:
59
- return {"schema": self._schema.json_schema()}
50
+ # compute Schema and call parent constructor
51
+ schema = create_schema_from_fields(schema) if isinstance(schema, list) else schema
52
+ super().__init__(sources=None, operator=BaseScan(datasource=self, output_schema=schema), schema=schema, id=id)
60
53
 
61
54
  @abstractmethod
62
55
  def __len__(self) -> int:
63
- """Returns the number of items in the data reader."""
56
+ """Returns the number of items in the `Dataset`."""
64
57
  pass
65
58
 
66
59
  @abstractmethod
67
60
  def __getitem__(self, idx: int) -> dict:
68
61
  """
69
- Returns a single item from the data reader at the given index.
62
+ Returns a single item from the `Dataset` at the given index.
70
63
 
71
64
  Args:
72
65
  idx (int): The index of the item to return
@@ -74,7 +67,7 @@ class DataReader(ABC):
74
67
  Returns:
75
68
  dict: A dictionary representing the item at the given index. The dictionary
76
69
  keys (i.e. fields) should match the fields specified in the schema of the
77
- data source, and the values should be the values associated with those fields.
70
+ dataset, and the values should be the values associated with those fields.
78
71
 
79
72
  # Example return value
80
73
  {"field1": value1, "field2": value2, ...}
@@ -83,111 +76,103 @@ class DataReader(ABC):
83
76
  pass
84
77
 
85
78
 
86
- # Second level of abstraction
87
- class DirectoryReader(DataReader):
79
+ class BaseFileDataset(IterDataset):
88
80
  """
89
- DirectoryReader returns a dictionary for each file in a directory. Each dictionary contains the filename and
90
- contents of a single file in the directory.
81
+ BaseFileDataset is the base class for multiple `IterDatasets` which iterate over
82
+ different types of files.
91
83
  """
92
84
 
93
- def __init__(self, path: str, schema: Schema) -> None:
85
+ def __init__(self, path: str, **kwargs) -> None:
94
86
  """
95
- Constructor for the `DirectoryReader` class.
87
+ Constructor for the `BaseFileDataset` class.
96
88
 
97
89
  Args:
98
- path (str): The path to the directory
99
- schema (Schema): The output schema of the data source
90
+ path (str): The path to the file or directory
91
+ kwargs (dict): Keyword arguments containing the `Dataset's` id and file-specific `Schema`
100
92
  """
101
- assert os.path.isdir(path), f"Path {path} is not a directory"
93
+ # check that path is a valid file or directory
94
+ assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
102
95
 
103
- self.filepaths = [
104
- os.path.join(path, filename)
105
- for filename in sorted(os.listdir(path))
106
- if os.path.isfile(os.path.join(path, filename))
107
- ]
108
- self.path = path
109
- super().__init__(schema)
96
+ # get list of filepaths
97
+ self.filepaths = []
98
+ if os.path.isfile(path):
99
+ self.filepaths = [path]
100
+ else:
101
+ self.filepaths = [
102
+ os.path.join(path, filename)
103
+ for filename in sorted(os.listdir(path))
104
+ if os.path.isfile(os.path.join(path, filename))
105
+ ]
110
106
 
111
- def serialize(self) -> dict:
112
- return {
113
- "schema": self.schema.json_schema(),
114
- "path": self.path,
115
- "source_type": "directory",
116
- }
107
+ # call parent constructor to set id, operator, and schema
108
+ super().__init__(**kwargs)
117
109
 
118
110
  def __len__(self) -> int:
119
111
  return len(self.filepaths)
120
112
 
121
113
 
122
- class FileReader(DataReader):
123
- """FileReader returns a single dictionary with the filename and contents of a local file (in bytes)."""
114
+ class BaseFileDirectoryDataset(IterDataset):
115
+ """
116
+ BaseFileDirectoryDataset is the base class for multiple `IterDatasets` which iterate over
117
+ different types of files. This class walks the entire directory tree rooted at `path`.
118
+ """
124
119
 
125
- def __init__(self, path: str) -> None:
120
+ def __init__(self, path: str, **kwargs) -> None:
126
121
  """
127
- Constructor for the `FileReader` class. The `schema` is set to the default `File` schema.
122
+ Constructor for the `BaseFileDirectoryDataset` class.
128
123
 
129
124
  Args:
130
125
  path (str): The path to the file
131
- """
132
- super().__init__(File)
133
- self.filepath = path
134
-
135
- def serialize(self) -> dict:
136
- return {
137
- "schema": self.schema.json_schema(),
138
- "path": self.filepath,
139
- "source_type": "file",
140
- }
126
+ kwargs (dict): Keyword arguments containing the `Dataset's` id and file-specific `Schema`
127
+ """
128
+ # check that path is a valid file or directory
129
+ assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
130
+
131
+ # get list of filepaths
132
+ self.filepaths = []
133
+ if os.path.isfile(path):
134
+ self.filepaths = [path]
135
+ else:
136
+ self.filepaths = []
137
+ for root, _, files in os.walk(path):
138
+ for file in files:
139
+ fp = os.path.join(root, file)
140
+ self.filepaths.append(fp)
141
+ self.filepaths = sorted(self.filepaths)
142
+
143
+ # call parent constructor to set id, operator, and schema
144
+ super().__init__(**kwargs)
141
145
 
142
146
  def __len__(self) -> int:
143
- return 1
144
-
145
- def __getitem__(self, idx: int) -> dict:
146
- """
147
- Returns a dictionary with the filename and contents of the file.
148
-
149
- Args:
150
- idx (int): The index of the item to return. This argument is ignored.
151
-
152
- Returns:
153
- dict: A dictionary with the filename and contents of the file.
154
-
155
- .. code-block:: python
156
-
157
- {
158
- "filename": "path/to/file.txt",
159
- "contents": b"file contents here",
160
- }
161
- """
162
- filename = self.filepath
163
- with open(self.filepath, "rb") as f:
164
- contents = f.read()
165
-
166
- return {"filename": filename, "contents": contents}
167
-
147
+ return len(self.filepaths)
168
148
 
169
- class MemoryReader(DataReader):
149
+ ########################
150
+ ### CONCRETE CLASSES ###
151
+ ########################
152
+ class MemoryDataset(IterDataset):
170
153
  """
171
- MemoryReader returns one or more dictionaries that reflect the contents of an in-memory Python object `vals`.
154
+ MemoryDataset returns one or more dictionaries that reflect the contents of an in-memory Python object `vals`.
172
155
  If `vals` is not a pd.DataFrame, then the dictionary returned by `__getitem__()` has a single field called "value".
173
156
  Otherwise, the dictionary contains the key-value mapping from columns to values for the `idx` row in the dataframe.
174
157
 
175
158
  TODO(gerardo): Add support for other types of in-memory data structures (he has some code for subclassing
176
- MemoryReader on his branch)
159
+ MemoryDataset on his branch)
177
160
  """
178
161
 
179
- def __init__(self, vals: list | pd.DataFrame) -> None:
162
+ def __init__(self, id: str, vals: list | pd.DataFrame, schema: type[BaseModel] | list[dict] | None = None) -> None:
180
163
  """
181
- Constructor for the `MemoryReader` class. The `schema` is set to the default `DefaultSchema` schema.
164
+ Constructor for the `MemoryDataset` class. The `schema` is set to the default `DefaultSchema` schema.
182
165
  If `vals` is a pd.DataFrame, then the schema is set to the schema inferred from the DataFrame.
183
166
 
184
167
  Args:
185
- vals (Any): The in-memory object to use as the data source
168
+ id (str): a string identifier for the `Dataset`
169
+ vals (Any): The in-memory data to iterate over
186
170
  """
187
171
  # if list[dict] --> convert to pd.DataFrame first
188
172
  self.vals = pd.DataFrame(vals) if isinstance(vals, list) and all([isinstance(item, dict) for item in vals]) else vals
189
- schema = Schema.from_df(self.vals) if isinstance(self.vals, pd.DataFrame) else DefaultSchema
190
- super().__init__(schema)
173
+ if schema is None:
174
+ schema = create_schema_from_df(self.vals) if isinstance(self.vals, pd.DataFrame) else DefaultSchema
175
+ super().__init__(id=id, schema=schema)
191
176
 
192
177
  def __len__(self) -> int:
193
178
  return len(self.vals)
@@ -228,20 +213,20 @@ class MemoryReader(DataReader):
228
213
  return item
229
214
 
230
215
 
231
- # Third level of abstraction
232
- class HTMLFileDirectoryReader(DirectoryReader):
216
+ class HTMLFileDataset(BaseFileDataset):
233
217
  """
234
- HTMLFileDirectoryReader returns a dictionary for each HTML file in a directory. Each dictionary contains the
218
+ HTMLFileDataset returns a dictionary for each HTML file in a directory. Each dictionary contains the
235
219
  filename, raw HTML content, and parsed content of a single HTML file in the directory.
236
220
  """
237
- def __init__(self, path: str) -> None:
221
+ def __init__(self, id: str, path: str) -> None:
238
222
  """
239
- Constructor for the `HTMLFileDirectoryReader` class. The `schema` is set to the `WebPage` schema.
223
+ Constructor for the `HTMLFileDataset` class. The `schema` is set to the `WebPage` schema.
240
224
 
241
225
  Args:
226
+ id (str): a string identifier for the `Dataset`
242
227
  path (str): The path to the directory
243
228
  """
244
- super().__init__(path=path, schema=WebPage)
229
+ super().__init__(path=path, id=id, schema=WebPage)
245
230
  assert all([filename.endswith(tuple(constants.HTML_EXTENSIONS)) for filename in self.filepaths])
246
231
 
247
232
  def _html_to_text_with_links(self, html: str) -> str:
@@ -296,19 +281,20 @@ class HTMLFileDirectoryReader(DirectoryReader):
296
281
  return item
297
282
 
298
283
 
299
- class ImageFileDirectoryReader(DirectoryReader):
284
+ class ImageFileDataset(BaseFileDataset):
300
285
  """
301
- ImageFileDirectoryReader returns a dictionary for each image file in a directory. Each dictionary contains the
286
+ ImageFileDataset returns a dictionary for each image file in a directory. Each dictionary contains the
302
287
  filename and the base64 encoded bytes content of a single image file in the directory.
303
288
  """
304
- def __init__(self, path: str) -> None:
289
+ def __init__(self, id: str, path: str) -> None:
305
290
  """
306
- Constructor for the `ImageFileDirectoryReader` class. The `schema` is set to the `ImageFile` schema.
291
+ Constructor for the `ImageFileDataset` class. The `schema` is set to the `ImageFile` schema.
307
292
 
308
293
  Args:
294
+ id (str): a string identifier for the `Dataset`
309
295
  path (str): The path to the directory
310
296
  """
311
- super().__init__(path=path, schema=ImageFile)
297
+ super().__init__(path=path, id=id, schema=ImageFile)
312
298
  assert all([filename.endswith(tuple(constants.IMAGE_EXTENSIONS)) for filename in self.filepaths])
313
299
 
314
300
  def __getitem__(self, idx: int) -> dict:
@@ -332,33 +318,35 @@ class ImageFileDirectoryReader(DirectoryReader):
332
318
  filepath = self.filepaths[idx]
333
319
  filename = os.path.basename(filepath)
334
320
  with open(filepath, "rb") as f:
335
- contents = base64.b64encode(f.read())
321
+ contents = base64.b64encode(f.read()).decode("utf-8")
336
322
 
337
323
  return {"filename": filename, "contents": contents}
338
324
 
339
325
 
340
- class PDFFileDirectoryReader(DirectoryReader):
326
+ class PDFFileDataset(BaseFileDataset):
341
327
  """
342
- PDFFileDirectoryReader returns a dictionary for each PDF file in a directory. Each dictionary contains the
328
+ PDFFileDataset returns a dictionary for each PDF file in a directory. Each dictionary contains the
343
329
  filename, raw PDF content, and parsed text content of a single PDF file in the directory.
344
330
 
345
331
  This class also uses one of a predefined set of PDF processors to extract text content from the PDF files.
346
332
  """
347
333
  def __init__(
348
334
  self,
335
+ id: str,
349
336
  path: str,
350
337
  pdfprocessor: str = "pypdf",
351
338
  file_cache_dir: str = "/tmp",
352
339
  ) -> None:
353
340
  """
354
- Constructor for the `PDFFileDirectoryReader` class. The `schema` is set to the `PDFFile` schema.
341
+ Constructor for the `PDFFileDataset` class. The `schema` is set to the `PDFFile` schema.
355
342
 
356
343
  Args:
344
+ id (str): a string identifier for the `Dataset`
357
345
  path (str): The path to the directory
358
346
  pdfprocessor (str): The PDF processor to use for extracting text content from the PDF files
359
347
  file_cache_dir (str): The directory to store the temporary files generated during PDF processing
360
348
  """
361
- super().__init__(path=path, schema=PDFFile)
349
+ super().__init__(path=path, id=id, schema=PDFFile)
362
350
  assert all([filename.endswith(tuple(constants.PDF_EXTENSIONS)) for filename in self.filepaths])
363
351
  self.pdfprocessor = pdfprocessor
364
352
  self.file_cache_dir = file_cache_dir
@@ -394,19 +382,20 @@ class PDFFileDirectoryReader(DirectoryReader):
394
382
  return {"filename": pdf_filename, "contents": pdf_bytes, "text_contents": text_content}
395
383
 
396
384
 
397
- class TextFileDirectoryReader(DirectoryReader):
385
+ class TextFileDataset(BaseFileDataset):
398
386
  """
399
- TextFileDirectoryReader returns a dictionary for each text file in a directory. Each dictionary contains the
387
+ TextFileDataset returns a dictionary for each text file in a directory. Each dictionary contains the
400
388
  filename and contents of a single text file in the directory.
401
389
  """
402
- def __init__(self, path: str) -> None:
390
+ def __init__(self, id: str, path: str) -> None:
403
391
  """
404
- Constructor for the `TextFileDirectoryReader` class. The `schema` is set to the `TextFile` schema.
392
+ Constructor for the `TextFileDataset` class. The `schema` is set to the `TextFile` schema.
405
393
 
406
394
  Args:
395
+ id (str): a string identifier for the `Dataset`
407
396
  path (str): The path to the directory
408
397
  """
409
- super().__init__(path=path, schema=TextFile)
398
+ super().__init__(path=path, id=id, schema=TextFile)
410
399
 
411
400
  def __getitem__(self, idx: int) -> dict:
412
401
  """
@@ -433,16 +422,16 @@ class TextFileDirectoryReader(DirectoryReader):
433
422
  return {"filename": filename, "contents": contents}
434
423
 
435
424
 
436
- class XLSFileDirectoryReader(DirectoryReader):
425
+ class XLSFileDataset(BaseFileDataset):
437
426
  """
438
- XLSFileDirectoryReader returns a dictionary for each XLS file in a directory. Each dictionary contains the
427
+ XLSFileDataset returns a dictionary for each XLS file in a directory. Each dictionary contains the
439
428
  filename, contents, sheet names, and the number of sheets for a single XLS file in the directory.
440
429
  """
441
- def __init__(self, path: str) -> None:
430
+ def __init__(self, id: str, path: str) -> None:
442
431
  """
443
- Constructor for the `XLSFileDirectoryReader` class. The `schema` is set to the `XLSFile` schema.
432
+ Constructor for the `XLSFileDataset` class. The `schema` is set to the `XLSFile` schema.
444
433
  """
445
- super().__init__(path=path, schema=XLSFile)
434
+ super().__init__(path=path, id=id, schema=XLSFile)
446
435
  assert all([filename.endswith(tuple(constants.XLS_EXTENSIONS)) for filename in self.filepaths])
447
436
 
448
437
  def __getitem__(self, idx: int) -> dict:
@@ -478,3 +467,90 @@ class XLSFileDirectoryReader(DirectoryReader):
478
467
  "sheet_names": xls.sheet_names,
479
468
  "number_sheets": len(xls.sheet_names),
480
469
  }
470
+
471
+
472
+ class AudioFileDataset(BaseFileDirectoryDataset):
473
+ """
474
+ AudioFileDataset returns a dictionary for each audio file in a directory. Each dictionary contains the
475
+ filename and the base64 encoded bytes content of a single audio file in the directory.
476
+ """
477
+ def __init__(self, id: str, path: str) -> None:
478
+ """
479
+ Constructor for the `AudioFileDataset` class. The `schema` is set to the `AudioFile` schema.
480
+
481
+ Args:
482
+ id (str): a string identifier for the `Dataset`
483
+ path (str): The path to the directory
484
+ """
485
+ super().__init__(path=path, id=id, schema=AudioFile)
486
+ assert all([filename.endswith(tuple(constants.AUDIO_EXTENSIONS)) for filename in self.filepaths])
487
+
488
+ def __getitem__(self, idx: int) -> dict:
489
+ """
490
+ Returns a dictionary with the filename and base64 encoded bytes content of the audio file at the
491
+ specified `idx`.
492
+
493
+ Args:
494
+ idx (int): The index of the item to return
495
+
496
+ Returns:
497
+ dict: A dictionary with the filename and base64 encoded bytes content of the audio file.
498
+
499
+ .. code-block:: python
500
+
501
+ {
502
+ "filename": "audio.wav",
503
+ "contents": "base64 encoded audio content here",
504
+ }
505
+ """
506
+ filepath = self.filepaths[idx]
507
+ filename = os.path.basename(filepath)
508
+ with open(filepath, "rb") as f:
509
+ contents = base64.b64encode(f.read()).decode("utf-8")
510
+
511
+ return {"filename": filename, "contents": contents}
512
+
513
+
514
+ def get_local_source(id: str, path: str | Path, **kwargs) -> dataset.Dataset:
515
+ """Return a `Dataset` for a local file or directory."""
516
+ if os.path.isfile(path):
517
+ return TextFileDataset(id, path)
518
+
519
+ elif os.path.isdir(path):
520
+ if all([f.endswith(tuple(constants.IMAGE_EXTENSIONS)) for f in os.listdir(path)]):
521
+ return ImageFileDataset(id, path)
522
+
523
+ elif all([f.endswith(tuple(constants.PDF_EXTENSIONS)) for f in os.listdir(path)]):
524
+ pdfprocessor = kwargs.get("pdfprocessor", constants.DEFAULT_PDF_PROCESSOR)
525
+ file_cache_dir = kwargs.get("file_cache_dir", "/tmp")
526
+ return PDFFileDataset(
527
+ id=id, path=path, pdfprocessor=pdfprocessor, file_cache_dir=file_cache_dir
528
+ )
529
+
530
+ elif all([f.endswith(tuple(constants.XLS_EXTENSIONS)) for f in os.listdir(path)]):
531
+ return XLSFileDataset(id, path)
532
+
533
+ elif all([f.endswith(tuple(constants.HTML_EXTENSIONS)) for f in os.listdir(path)]):
534
+ return HTMLFileDataset(id, path)
535
+
536
+ else:
537
+ return TextFileDataset(id, path)
538
+ else:
539
+ raise ValueError(f"Path {path} is invalid. Does not point to a file or directory.")
540
+
541
+
542
+ def resolve_datasource(id: str, source: str | Path | list | pd.DataFrame, **kwargs) -> dataset.Dataset:
543
+ """
544
+ This helper function returns a `Dataset` object based on the `source` type.
545
+ The returned `Dataset` object is guaranteed to have a schema.
546
+ """
547
+ if isinstance(source, (str, Path)):
548
+ source = get_local_source(id, source, **kwargs)
549
+
550
+ elif isinstance(source, (list, pd.DataFrame)):
551
+ source = MemoryDataset(id=id, vals=source)
552
+
553
+ else:
554
+ raise ValueError(f"Invalid source type: {type(source)}, We only support str, Path, list[dict], and pd.DataFrame")
555
+
556
+ return source
@@ -2,10 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  from typing import Any
4
4
 
5
- from palimpzest.core.lib.fields import Field
6
- from palimpzest.core.lib.schemas import OperatorDerivedSchema, Schema
5
+ from pydantic import BaseModel
7
6
 
7
+ from palimpzest.core.lib.schemas import create_schema_from_fields
8
8
 
9
+
10
+ # TODO: need to rethink how group bys work
9
11
  # signature for a group by aggregate that applies
10
12
  # group and aggregation to an input tuple
11
13
  class GroupBySig:
@@ -14,12 +16,12 @@ class GroupBySig:
14
16
  self.agg_funcs = agg_funcs
15
17
  self.agg_fields = agg_fields
16
18
 
17
- def validate_schema(self, input_schema: Schema) -> tuple[bool, str | None]:
19
+ def validate_schema(self, input_schema: BaseModel) -> tuple[bool, str | None]:
18
20
  for f in self.group_by_fields:
19
- if not hasattr(input_schema, f):
21
+ if f not in input_schema.model_fields:
20
22
  return (False, "Supplied schema has no field " + f)
21
23
  for f in self.agg_fields:
22
- if not hasattr(input_schema, f):
24
+ if f not in input_schema.model_fields:
23
25
  return (False, "Supplied schema has no field " + f)
24
26
  return (True, None)
25
27
 
@@ -48,16 +50,17 @@ class GroupBySig:
48
50
  ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
49
51
  return ops
50
52
 
51
- def output_schema(self) -> type[OperatorDerivedSchema]:
53
+ def output_schema(self) -> type[BaseModel]:
52
54
  # the output class varies depending on the group by, so here
53
55
  # we dynamically construct this output
54
- schema = type("CustomGroupBy", (OperatorDerivedSchema,), {})
55
-
56
+ fields = []
56
57
  for g in self.group_by_fields:
57
- f = Field(desc=g)
58
- setattr(schema, g, f)
58
+ f = {"name": g, "type": Any, "desc": f"Group by field: {g}"}
59
+ fields.append(f)
60
+
59
61
  ops = self.get_agg_field_names()
60
62
  for op in ops:
61
- f = Field(desc=op)
62
- setattr(schema, op, f)
63
- return schema
63
+ f = {"name": op, "type": Any, "desc": f"Aggregate field: {op}"}
64
+ fields.append(f)
65
+
66
+ return create_schema_from_fields(fields)