palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- palimpzest/__init__.py +37 -6
- palimpzest/agents/__init__.py +0 -0
- palimpzest/agents/compute_agents.py +0 -0
- palimpzest/agents/search_agents.py +637 -0
- palimpzest/constants.py +259 -197
- palimpzest/core/data/context.py +393 -0
- palimpzest/core/data/context_manager.py +163 -0
- palimpzest/core/data/dataset.py +634 -0
- palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
- palimpzest/core/elements/groupbysig.py +16 -13
- palimpzest/core/elements/records.py +166 -75
- palimpzest/core/lib/schemas.py +152 -390
- palimpzest/core/{data/dataclasses.py → models.py} +306 -170
- palimpzest/policy.py +2 -27
- palimpzest/prompts/__init__.py +35 -5
- palimpzest/prompts/agent_prompts.py +357 -0
- palimpzest/prompts/context_search.py +9 -0
- palimpzest/prompts/convert_prompts.py +61 -5
- palimpzest/prompts/filter_prompts.py +50 -5
- palimpzest/prompts/join_prompts.py +163 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
- palimpzest/prompts/prompt_factory.py +358 -46
- palimpzest/prompts/validator.py +239 -0
- palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
- palimpzest/query/execution/execution_strategy.py +210 -317
- palimpzest/query/execution/execution_strategy_type.py +5 -7
- palimpzest/query/execution/mab_execution_strategy.py +249 -136
- palimpzest/query/execution/parallel_execution_strategy.py +153 -244
- palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
- palimpzest/query/generators/generators.py +157 -330
- palimpzest/query/operators/__init__.py +15 -5
- palimpzest/query/operators/aggregate.py +50 -33
- palimpzest/query/operators/compute.py +201 -0
- palimpzest/query/operators/convert.py +27 -21
- palimpzest/query/operators/critique_and_refine_convert.py +7 -5
- palimpzest/query/operators/distinct.py +62 -0
- palimpzest/query/operators/filter.py +22 -13
- palimpzest/query/operators/join.py +402 -0
- palimpzest/query/operators/limit.py +3 -3
- palimpzest/query/operators/logical.py +198 -80
- palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
- palimpzest/query/operators/physical.py +27 -21
- palimpzest/query/operators/project.py +3 -3
- palimpzest/query/operators/rag_convert.py +7 -7
- palimpzest/query/operators/retrieve.py +9 -9
- palimpzest/query/operators/scan.py +81 -42
- palimpzest/query/operators/search.py +524 -0
- palimpzest/query/operators/split_convert.py +10 -8
- palimpzest/query/optimizer/__init__.py +7 -9
- palimpzest/query/optimizer/cost_model.py +108 -441
- palimpzest/query/optimizer/optimizer.py +123 -181
- palimpzest/query/optimizer/optimizer_strategy.py +66 -61
- palimpzest/query/optimizer/plan.py +352 -67
- palimpzest/query/optimizer/primitives.py +43 -19
- palimpzest/query/optimizer/rules.py +484 -646
- palimpzest/query/optimizer/tasks.py +127 -58
- palimpzest/query/processor/config.py +41 -76
- palimpzest/query/processor/query_processor.py +73 -18
- palimpzest/query/processor/query_processor_factory.py +46 -38
- palimpzest/schemabuilder/schema_builder.py +15 -28
- palimpzest/utils/model_helpers.py +27 -77
- palimpzest/utils/progress.py +114 -102
- palimpzest/validator/__init__.py +0 -0
- palimpzest/validator/validator.py +306 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
- palimpzest-0.8.0.dist-info/RECORD +95 -0
- palimpzest/core/lib/fields.py +0 -141
- palimpzest/prompts/code_synthesis_prompts.py +0 -28
- palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
- palimpzest/query/generators/api_client_factory.py +0 -30
- palimpzest/query/operators/code_synthesis_convert.py +0 -488
- palimpzest/query/operators/map.py +0 -130
- palimpzest/query/processor/nosentinel_processor.py +0 -33
- palimpzest/query/processor/processing_strategy_type.py +0 -28
- palimpzest/query/processor/sentinel_processor.py +0 -88
- palimpzest/query/processor/streaming_processor.py +0 -149
- palimpzest/sets.py +0 -405
- palimpzest/utils/datareader_helpers.py +0 -61
- palimpzest/utils/demo_helpers.py +0 -75
- palimpzest/utils/field_helpers.py +0 -69
- palimpzest/utils/generation_helpers.py +0 -69
- palimpzest/utils/sandbox.py +0 -183
- palimpzest-0.7.20.dist-info/RECORD +0 -95
- /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0

palimpzest/core/data/{datareaders.py → iter_dataset.py}

@@ -4,69 +4,62 @@ import base64
 import os
 from abc import ABC, abstractmethod
 from io import BytesIO
+from pathlib import Path
 
 import pandas as pd
 from bs4 import BeautifulSoup
+from pydantic import BaseModel
 
 from palimpzest import constants
+from palimpzest.core.data import dataset
 from palimpzest.core.lib.schemas import (
+    AudioFile,
     DefaultSchema,
-    File,
     ImageFile,
     PDFFile,
-    Schema,
     TextFile,
     WebPage,
     XLSFile,
+    create_schema_from_df,
+    create_schema_from_fields,
 )
+from palimpzest.query.operators.logical import BaseScan
 from palimpzest.tools.pdfparser import get_text_from_pdf
 
 
-
-
+####################
+### BASE CLASSES ###
+####################
+class IterDataset(dataset.Dataset, ABC):
     """
-    The `
-
+    The `IterDataset` is an abstract base class for root `Datasets` whose data is accessed
+    via iteration. Classes which inherit from this class must implement two methods:
 
-
-
-    - `__len__()`: which returns the number of elements in the data source
+    - `__len__()`: which returns the number of elements in the dataset
     - `__getitem__(idx: int)`: which takes in an `idx` and returns the element at that index
     """
 
-    def __init__(self, schema: type[
+    def __init__(self, id: str, schema: type[BaseModel] | list[dict]) -> None:
         """
-        Constructor for the `
+        Constructor for the `IterDataset` class.
 
         Args:
-
+            id (str): a string identifier for the `Dataset`
+            schema (BaseModel | list[dict]): The output schema of the records returned by the `Dataset`
         """
-        #
-
-
-    def __eq__(self, __value: object) -> bool:
-        return self.__dict__ == __value.__dict__
-
-    def __str__(self) -> str:
-        return f"{self.__class__.__name__}(schema={self.schema})"
-
-    @property
-    def schema(self) -> Schema:
-        return self._schema
-
-    # NOTE: currently used by optimizer to compute node id for DataReaders
-    def serialize(self) -> dict:
-        return {"schema": self._schema.json_schema()}
+        # compute Schema and call parent constructor
+        schema = create_schema_from_fields(schema) if isinstance(schema, list) else schema
+        super().__init__(sources=None, operator=BaseScan(datasource=self, output_schema=schema), schema=schema, id=id)
 
     @abstractmethod
     def __len__(self) -> int:
-        """Returns the number of items in the
+        """Returns the number of items in the `Dataset`."""
        pass
 
     @abstractmethod
     def __getitem__(self, idx: int) -> dict:
         """
-        Returns a single item from the
+        Returns a single item from the `Dataset` at the given index.
 
         Args:
             idx (int): The index of the item to return

@@ -74,7 +67,7 @@ class DataReader(ABC):
         Returns:
             dict: A dictionary representing the item at the given index. The dictionary
             keys (i.e. fields) should match the fields specified in the schema of the
-
+            dataset, and the values should be the values associated with those fields.
 
             # Example return value
             {"field1": value1, "field2": value2, ...}
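The two abstract methods above are the entire contract for a root dataset in 0.8.0. As a rough sketch (not from the package) of what a custom source could look like, assuming the renamed module path and the field-dict schema format that `create_schema_from_fields` consumes later in this diff:

```python
from palimpzest.core.data.iter_dataset import IterDataset


class RangeDataset(IterDataset):
    """Toy root dataset yielding the integers 0..n-1, one record per index."""

    def __init__(self, id: str, n: int) -> None:
        self.n = n
        # schema may be given as a list of field dicts, converted via create_schema_from_fields()
        super().__init__(id=id, schema=[{"name": "value", "type": int, "desc": "an integer"}])

    def __len__(self) -> int:
        return self.n

    def __getitem__(self, idx: int) -> dict:
        # keys must match the fields declared in the schema
        return {"value": idx}
```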
@@ -83,111 +76,103 @@ class DataReader(ABC):
         pass
 
 
-
-class DirectoryReader(DataReader):
+class BaseFileDataset(IterDataset):
     """
-
-
+    BaseFileDataset is the base class for multiple `IterDatasets` which iterate over
+    different types of files.
     """
 
-    def __init__(self, path: str,
+    def __init__(self, path: str, **kwargs) -> None:
         """
-        Constructor for the `
+        Constructor for the `BaseFileDataset` class.
 
         Args:
-            path (str): The path to the
-
+            path (str): The path to the file
+            kwargs (dict): Keyword arguments containing the `Dataset's` id and file-specific `Schema`
         """
-
+        # check that path is a valid file or directory
+        assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
 
-
-
-
-
-
-
-
+        # get list of filepaths
+        self.filepaths = []
+        if os.path.isfile(path):
+            self.filepaths = [path]
+        else:
+            self.filepaths = [
+                os.path.join(path, filename)
+                for filename in sorted(os.listdir(path))
+                if os.path.isfile(os.path.join(path, filename))
+            ]
 
-
-
-            "schema": self.schema.json_schema(),
-            "path": self.path,
-            "source_type": "directory",
-        }
+        # call parent constructor to set id, operator, and schema
+        super().__init__(**kwargs)
 
     def __len__(self) -> int:
         return len(self.filepaths)
 
 
-class 
-    """
+class BaseFileDirectoryDataset(IterDataset):
+    """
+    BaseFileDirectoryDataset is the base class for multiple `IterDatasets` which iterate over
+    different types of files. This class walks the entire directory tree rooted at `path`.
+    """
 
-    def __init__(self, path: str) -> None:
+    def __init__(self, path: str, **kwargs) -> None:
         """
-        Constructor for the `
+        Constructor for the `BaseFileDataset` class.
 
         Args:
             path (str): The path to the file
-
-
-
-
-
-
-
-
-
-
+            kwargs (dict): Keyword arguments containing the `Dataset's` id and file-specific `Schema`
+        """
+        # check that path is a valid file or directory
+        assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
+
+        # get list of filepaths
+        self.filepaths = []
+        if os.path.isfile(path):
+            self.filepaths = [path]
+        else:
+            self.filepaths = []
+            for root, _, files in os.walk(path):
+                for file in files:
+                    fp = os.path.join(root, file)
+                    self.filepaths.append(fp)
+            self.filepaths = sorted(self.filepaths)
+
+        # call parent constructor to set id, operator, and schema
+        super().__init__(**kwargs)
 
     def __len__(self) -> int:
-        return 
-
-    def __getitem__(self, idx: int) -> dict:
-        """
-        Returns a dictionary with the filename and contents of the file.
-
-        Args:
-            idx (int): The index of the item to return. This argument is ignored.
-
-        Returns:
-            dict: A dictionary with the filename and contents of the file.
-
-        .. code-block:: python
-
-            {
-                "filename": "path/to/file.txt",
-                "contents": b"file contents here",
-            }
-        """
-        filename = self.filepath
-        with open(self.filepath, "rb") as f:
-            contents = f.read()
-
-        return {"filename": filename, "contents": contents}
-
+        return len(self.filepaths)
 
-
+########################
+### CONCRETE CLASSES ###
+########################
+class MemoryDataset(IterDataset):
     """
-
+    MemoryDataset returns one or more dictionaries that reflect the contents of an in-memory Python object `vals`.
     If `vals` is not a pd.DataFrame, then the dictionary returned by `__getitem__()` has a single field called "value".
     Otherwise, the dictionary contains the key-value mapping from columns to values for the `idx` row in the dataframe.
 
     TODO(gerardo): Add support for other types of in-memory data structures (he has some code for subclassing
-
+    MemoryDataset on his branch)
     """
 
-    def __init__(self, vals: list | pd.DataFrame) -> None:
+    def __init__(self, id: str, vals: list | pd.DataFrame, schema: type[BaseModel] | list[dict] | None = None) -> None:
         """
-        Constructor for the `
+        Constructor for the `MemoryDataset` class. The `schema` is set to the default `DefaultSchema` schema.
         If `vals` is a pd.DataFrame, then the schema is set to the schema inferred from the DataFrame.
 
         Args:
-
+            id (str): a string identifier for the `Dataset`
+            vals (Any): The in-memory data to iterate over
         """
         # if list[dict] --> convert to pd.DataFrame first
         self.vals = pd.DataFrame(vals) if isinstance(vals, list) and all([isinstance(item, dict) for item in vals]) else vals
-
-
+        if schema is None:
+            schema = create_schema_from_df(self.vals) if isinstance(self.vals, pd.DataFrame) else DefaultSchema
+        super().__init__(id=id, schema=schema)
 
     def __len__(self) -> int:
         return len(self.vals)
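A usage sketch for the constructor above: list-of-dict values are converted to a DataFrame first (so the schema is inferred via `create_schema_from_df`), while other lists keep `DefaultSchema` and surface a single `value` field. The ids below are illustrative:

```python
from palimpzest.core.data.iter_dataset import MemoryDataset

# list[dict] is converted to a DataFrame, so the schema is inferred from the columns
rows = MemoryDataset(id="toy-rows", vals=[{"name": "a.txt", "size": 10}, {"name": "b.txt", "size": 20}])
assert len(rows) == 2

# a plain list keeps DefaultSchema, whose records expose a single "value" field
values = MemoryDataset(id="toy-values", vals=[1, 2, 3])
assert len(values) == 3
```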
@@ -228,20 +213,20 @@ class MemoryReader(DataReader):
         return item
 
 
-
-class HTMLFileDirectoryReader(DirectoryReader):
+class HTMLFileDataset(BaseFileDataset):
     """
-
+    HTMLFileDataset returns a dictionary for each HTML file in a directory. Each dictionary contains the
     filename, raw HTML content, and parsed content of a single HTML file in the directory.
     """
-    def __init__(self, path: str) -> None:
+    def __init__(self, id: str, path: str) -> None:
         """
-        Constructor for the `
+        Constructor for the `HTMLFileDataset` class. The `schema` is set to the `WebPage` schema.
 
         Args:
+            id (str): a string identifier for the `Dataset`
             path (str): The path to the directory
         """
-        super().__init__(path=path, schema=WebPage)
+        super().__init__(path=path, id=id, schema=WebPage)
         assert all([filename.endswith(tuple(constants.HTML_EXTENSIONS)) for filename in self.filepaths])
 
     def _html_to_text_with_links(self, html: str) -> str:

@@ -296,19 +281,20 @@ class HTMLFileDirectoryReader(DirectoryReader):
         return item
 
 
-class ImageFileDirectoryReader(DirectoryReader):
+class ImageFileDataset(BaseFileDataset):
     """
-
+    ImageFileDataset returns a dictionary for each image file in a directory. Each dictionary contains the
     filename and the base64 encoded bytes content of a single image file in the directory.
     """
-    def __init__(self, path: str) -> None:
+    def __init__(self, id: str, path: str) -> None:
         """
-        Constructor for the `
+        Constructor for the `ImageFileDataset` class. The `schema` is set to the `ImageFile` schema.
 
         Args:
+            id (str): a string identifier for the `Dataset`
             path (str): The path to the directory
         """
-        super().__init__(path=path, schema=ImageFile)
+        super().__init__(path=path, id=id, schema=ImageFile)
         assert all([filename.endswith(tuple(constants.IMAGE_EXTENSIONS)) for filename in self.filepaths])
 
     def __getitem__(self, idx: int) -> dict:

@@ -332,33 +318,35 @@ class ImageFileDirectoryReader(DirectoryReader):
         filepath = self.filepaths[idx]
         filename = os.path.basename(filepath)
         with open(filepath, "rb") as f:
-            contents = base64.b64encode(f.read())
+            contents = base64.b64encode(f.read()).decode("utf-8")
 
         return {"filename": filename, "contents": contents}
 
 
-class PDFFileDirectoryReader(DirectoryReader):
+class PDFFileDataset(BaseFileDataset):
     """
-
+    PDFFileDataset returns a dictionary for each PDF file in a directory. Each dictionary contains the
     filename, raw PDF content, and parsed text content of a single PDF file in the directory.
 
     This class also uses one of a predefined set of PDF processors to extract text content from the PDF files.
     """
     def __init__(
         self,
+        id: str,
         path: str,
         pdfprocessor: str = "pypdf",
         file_cache_dir: str = "/tmp",
     ) -> None:
         """
-        Constructor for the `
+        Constructor for the `PDFFileDataset` class. The `schema` is set to the `PDFFile` schema.
 
         Args:
+            id (str): a string identifier for the `Dataset`
             path (str): The path to the directory
             pdfprocessor (str): The PDF processor to use for extracting text content from the PDF files
             file_cache_dir (str): The directory to store the temporary files generated during PDF processing
         """
-        super().__init__(path=path, schema=PDFFile)
+        super().__init__(path=path, id=id, schema=PDFFile)
         assert all([filename.endswith(tuple(constants.PDF_EXTENSIONS)) for filename in self.filepaths])
         self.pdfprocessor = pdfprocessor
         self.file_cache_dir = file_cache_dir
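A behavioral change worth noting in the hunk above: `__getitem__` now calls `.decode("utf-8")` on the base64 output, so `contents` is a JSON-serializable string rather than `bytes`. Downstream code can recover the raw bytes with the standard library (payload below is illustrative):

```python
import base64

# shape of the new "contents" value produced by __getitem__ above
encoded = base64.b64encode(b"fake image bytes").decode("utf-8")
raw = base64.b64decode(encoded)  # b64decode accepts both str and bytes
assert raw == b"fake image bytes"
```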
@@ -394,19 +382,20 @@ class PDFFileDirectoryReader(DirectoryReader):
         return {"filename": pdf_filename, "contents": pdf_bytes, "text_contents": text_content}
 
 
-class TextFileDirectoryReader(DirectoryReader):
+class TextFileDataset(BaseFileDataset):
     """
-
+    TextFileDataset returns a dictionary for each text file in a directory. Each dictionary contains the
     filename and contents of a single text file in the directory.
     """
-    def __init__(self, path: str) -> None:
+    def __init__(self, id: str, path: str) -> None:
         """
-        Constructor for the `
+        Constructor for the `TextFileDataset` class. The `schema` is set to the `TextFile` schema.
 
         Args:
+            id (str): a string identifier for the `Dataset`
             path (str): The path to the directory
         """
-        super().__init__(path=path, schema=TextFile)
+        super().__init__(path=path, id=id, schema=TextFile)
 
     def __getitem__(self, idx: int) -> dict:
         """

@@ -433,16 +422,16 @@ class TextFileDirectoryReader(DirectoryReader):
         return {"filename": filename, "contents": contents}
 
 
-class XLSFileDirectoryReader(DirectoryReader):
+class XLSFileDataset(BaseFileDataset):
     """
-
+    XLSFileDataset returns a dictionary for each XLS file in a directory. Each dictionary contains the
     filename, contents, sheet names, and the number of sheets for a single XLS file in the directory.
     """
-    def __init__(self, path: str) -> None:
+    def __init__(self, id: str, path: str) -> None:
         """
-        Constructor for the `
+        Constructor for the `XLSFileDataset` class. The `schema` is set to the `XLSFile` schema.
         """
-        super().__init__(path=path, schema=XLSFile)
+        super().__init__(path=path, id=id, schema=XLSFile)
         assert all([filename.endswith(tuple(constants.XLS_EXTENSIONS)) for filename in self.filepaths])
 
     def __getitem__(self, idx: int) -> dict:

@@ -478,3 +467,90 @@ class XLSFileDirectoryReader(DirectoryReader):
             "sheet_names": xls.sheet_names,
             "number_sheets": len(xls.sheet_names),
         }
+
+
+class AudioFileDataset(BaseFileDirectoryDataset):
+    """
+    AudioFileDataset returns a dictionary for each audio file in a directory. Each dictionary contains the
+    filename and the base64 encoded bytes content of a single audio file in the directory.
+    """
+    def __init__(self, id: str, path: str) -> None:
+        """
+        Constructor for the `AudioFileDataset` class. The `schema` is set to the `AudioFile` schema.
+
+        Args:
+            id (str): a string identifier for the `Dataset`
+            path (str): The path to the directory
+        """
+        super().__init__(path=path, id=id, schema=AudioFile)
+        assert all([filename.endswith(tuple(constants.AUDIO_EXTENSIONS)) for filename in self.filepaths])
+
+    def __getitem__(self, idx: int) -> dict:
+        """
+        Returns a dictionary with the filename and base64 encoded bytes content of the audio file at the
+        specified `idx`.
+
+        Args:
+            idx (int): The index of the item to return
+
+        Returns:
+            dict: A dictionary with the filename and base64 encoded bytes content of the audio file.
+
+        .. code-block:: python
+
+            {
+                "filename": "audio.wav",
+                "contents": b"base64 encoded audio content here",
+            }
+        """
+        filepath = self.filepaths[idx]
+        filename = os.path.basename(filepath)
+        with open(filepath, "rb") as f:
+            contents = base64.b64encode(f.read()).decode("utf-8")
+
+        return {"filename": filename, "contents": contents}
+
+
+def get_local_source(id: str, path: str | Path, **kwargs) -> dataset.Dataset:
+    """Return a `Dataset` for a local file or directory."""
+    if os.path.isfile(path):
+        return TextFileDataset(id, path)
+
+    elif os.path.isdir(path):
+        if all([f.endswith(tuple(constants.IMAGE_EXTENSIONS)) for f in os.listdir(path)]):
+            return ImageFileDataset(id, path)
+
+        elif all([f.endswith(tuple(constants.PDF_EXTENSIONS)) for f in os.listdir(path)]):
+            pdfprocessor = kwargs.get("pdfprocessor", constants.DEFAULT_PDF_PROCESSOR)
+            file_cache_dir = kwargs.get("file_cache_dir", "/tmp")
+            return PDFFileDataset(
+                id=id, path=path, pdfprocessor=pdfprocessor, file_cache_dir=file_cache_dir
+            )
+
+        elif all([f.endswith(tuple(constants.XLS_EXTENSIONS)) for f in os.listdir(path)]):
+            return XLSFileDataset(id, path)
+
+        elif all([f.endswith(tuple(constants.HTML_EXTENSIONS)) for f in os.listdir(path)]):
+            return HTMLFileDataset(id, path)
+
+        else:
+            return TextFileDataset(id, path)
+    else:
+        raise ValueError(f"Path {path} is invalid. Does not point to a file or directory.")
+
+
+def resolve_datasource(id: str, source: str | Path | list | pd.DataFrame, **kwargs) -> dataset.Dataset:
+    """
+    This helper function returns a `Dataset` object based on the `source` type.
+    The returned `Dataset` object is guaranteed to have a schema.
+    """
+    if isinstance(source, (str, Path)):
+        source = get_local_source(id, source, **kwargs)
+
+    elif isinstance(source, (list, pd.DataFrame)):
+        source = MemoryDataset(id=id, vals=source)
+
+    else:
+        raise ValueError(f"Invalid source type: {type(source)}, We only support str, Path, list[dict], and pd.DataFrame")
+
+    return source
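A dispatch sketch for the two new helpers above; the ids and paths are illustrative. Note that each directory branch fires only when every file in the directory carries the matching extension, otherwise the fallback is `TextFileDataset`:

```python
from palimpzest.core.data.iter_dataset import resolve_datasource

pdf_ds = resolve_datasource("papers", "testdata/pdfs", pdfprocessor="pypdf")  # dir of PDFs -> PDFFileDataset
mem_ds = resolve_datasource("rows", [{"a": 1}, {"a": 2}])                     # list[dict]  -> MemoryDataset
txt_ds = resolve_datasource("notes", "testdata/notes.txt")                    # single file -> TextFileDataset
```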
palimpzest/core/elements/groupbysig.py

@@ -2,10 +2,12 @@ from __future__ import annotations
 
 from typing import Any
 
-from 
-from palimpzest.core.lib.schemas import OperatorDerivedSchema, Schema
+from pydantic import BaseModel
 
+from palimpzest.core.lib.schemas import create_schema_from_fields
 
+
+# TODO: need to rethink how group bys work
 # signature for a group by aggregate that applies
 # group and aggregation to an input tuple
 class GroupBySig:

@@ -14,12 +16,12 @@ class GroupBySig:
         self.agg_funcs = agg_funcs
         self.agg_fields = agg_fields
 
-    def validate_schema(self, input_schema:
+    def validate_schema(self, input_schema: BaseModel) -> tuple[bool, str | None]:
         for f in self.group_by_fields:
-            if not 
+            if f not in input_schema.model_fields:
                 return (False, "Supplied schema has no field " + f)
         for f in self.agg_fields:
-            if not 
+            if f not in input_schema.model_fields:
                 return (False, "Supplied schema has no field " + f)
         return (True, None)
 

@@ -48,16 +50,17 @@ class GroupBySig:
             ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
         return ops
 
-    def output_schema(self) -> type[
+    def output_schema(self) -> type[BaseModel]:
         # the output class varies depending on the group by, so here
         # we dynamically construct this output
-
-
+        fields = []
         for g in self.group_by_fields:
-            f = 
-
+            f = {"name": g, "type": Any, "desc": f"Group by field: {g}"}
+            fields.append(f)
+
         ops = self.get_agg_field_names()
         for op in ops:
-            f = 
-
-
+            f = {"name": op, "type": Any, "desc": f"Aggregate field: {op}"}
+            fields.append(f)
+
+        return create_schema_from_fields(fields)