palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +7 -9
- palimpzest/constants.py +47 -7
- palimpzest/core/__init__.py +20 -26
- palimpzest/core/data/dataclasses.py +9 -2
- palimpzest/core/data/datareaders.py +497 -0
- palimpzest/core/elements/records.py +29 -37
- palimpzest/core/lib/fields.py +14 -12
- palimpzest/core/lib/schemas.py +80 -94
- palimpzest/policy.py +58 -0
- palimpzest/prompts/__init__.py +22 -0
- palimpzest/prompts/code_synthesis_prompts.py +28 -0
- palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
- palimpzest/prompts/filter_prompts.py +69 -0
- palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
- palimpzest/prompts/prompt_factory.py +732 -0
- palimpzest/prompts/util_phrases.py +14 -0
- palimpzest/query/execution/execution_strategy.py +0 -3
- palimpzest/query/execution/parallel_execution_strategy.py +12 -25
- palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
- palimpzest/query/generators/generators.py +71 -347
- palimpzest/query/operators/__init__.py +5 -5
- palimpzest/query/operators/aggregate.py +10 -5
- palimpzest/query/operators/code_synthesis_convert.py +4 -48
- palimpzest/query/operators/convert.py +5 -2
- palimpzest/query/operators/critique_and_refine_convert.py +112 -0
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/logical.py +28 -27
- palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
- palimpzest/query/operators/physical.py +32 -20
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/rag_convert.py +6 -3
- palimpzest/query/operators/retrieve.py +13 -31
- palimpzest/query/operators/scan.py +150 -0
- palimpzest/query/optimizer/__init__.py +5 -1
- palimpzest/query/optimizer/cost_model.py +18 -34
- palimpzest/query/optimizer/optimizer.py +40 -25
- palimpzest/query/optimizer/optimizer_strategy.py +26 -0
- palimpzest/query/optimizer/plan.py +2 -2
- palimpzest/query/optimizer/rules.py +118 -27
- palimpzest/query/processor/config.py +12 -1
- palimpzest/query/processor/mab_sentinel_processor.py +125 -112
- palimpzest/query/processor/nosentinel_processor.py +46 -62
- palimpzest/query/processor/query_processor.py +10 -20
- palimpzest/query/processor/query_processor_factory.py +12 -5
- palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
- palimpzest/query/processor/streaming_processor.py +11 -17
- palimpzest/sets.py +170 -94
- palimpzest/tools/pdfparser.py +5 -64
- palimpzest/utils/datareader_helpers.py +61 -0
- palimpzest/utils/field_helpers.py +69 -0
- palimpzest/utils/hash_helpers.py +3 -2
- palimpzest/utils/udfs.py +0 -28
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
- palimpzest-0.6.1.dist-info/RECORD +87 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
- cli/README.md +0 -156
- cli/__init__.py +0 -0
- cli/cli_main.py +0 -390
- palimpzest/config.py +0 -89
- palimpzest/core/data/datasources.py +0 -369
- palimpzest/datamanager/__init__.py +0 -0
- palimpzest/datamanager/datamanager.py +0 -300
- palimpzest/prompts.py +0 -397
- palimpzest/query/operators/datasource.py +0 -202
- palimpzest-0.5.4.dist-info/RECORD +0 -83
- palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
palimpzest/sets.py
CHANGED
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
-import
+from pathlib import Path
 from typing import Callable
 
 import pandas as pd
 
 from palimpzest.constants import AggFunc, Cardinality
-from palimpzest.core.data.
+from palimpzest.core.data.datareaders import DataReader
 from palimpzest.core.elements.filters import Filter
 from palimpzest.core.elements.groupbysig import GroupBySig
-from palimpzest.core.lib.
-from palimpzest.
+from palimpzest.core.lib.fields import ListField, StringField
+from palimpzest.core.lib.schemas import Number, Schema
+from palimpzest.policy import construct_policy_from_kwargs
 from palimpzest.query.processor.config import QueryProcessorConfig
-from palimpzest.utils.
+from palimpzest.utils.datareader_helpers import get_local_datareader
+from palimpzest.utils.hash_helpers import hash_for_serialized_dict
 from palimpzest.utils.index_helpers import get_index_str
 
 
@@ -21,27 +23,11 @@ from palimpzest.utils.index_helpers import get_index_str
 #####################################################
 class Set:
     """
-    A Set is the logical abstraction for a set of DataRecords matching some Schema. It is
-    also a node in the computation graph of a Dataset.
-
-    Each Dataset consists of one or more Sets. The "initial" Set in a Dataset can be thought
-    of as the Set that results from reading each DataRecord unaltered from the source. For each
-    filter or transformation that is applied to the Dataset, a new Set is created which defines
-    the set of DataRecords that result from applying that filter or transformation. In brief,
-    the Sets define a Dataset's computation graph. Sets can also be cached to maximize the reuse
-    of past computation.
-
-    Sets are initialized with a dataset_id, a schema, and a source. The source is either an
-    existing Set or a raw data source (such as a directory or S3 prefix). Sets may be initialized
-    with a Filter (which defines the filtering performed on the source to obtain *this* Set),
-    and a description of what this Set is meant to represent.
     """
 
-    SET_VERSION = 0.1
-
     def __init__(
         self,
-        source: Set |
+        source: Set | DataReader,
         schema: Schema,
         desc: str | None = None,
         filter: Filter | None = None,
@@ -49,10 +35,11 @@ class Set:
         agg_func: AggFunc | None = None,
         group_by: GroupBySig | None = None,
         project_cols: list[str] | None = None,
-        index
+        index=None,  # TODO(Siva): Abstract Index and add a type here and elsewhere
+        search_func: Callable | None = None,
         search_attr: str | None = None,
        output_attr: str | None = None,
-        k: int | None = None,
+        k: int | None = None,  # TODO: disambiguate `k` to be something like `retrieve_k`
         limit: int | None = None,
         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
         depends_on: list[str] | None = None,
@@ -67,6 +54,7 @@ class Set:
         self._group_by = group_by
         self._project_cols = None if project_cols is None else sorted(project_cols)
         self._index = index
+        self._search_func = search_func
         self._search_attr = search_attr
         self._output_attr = output_attr
         self._k = k
@@ -75,24 +63,22 @@ class Set:
         self._depends_on = [] if depends_on is None else sorted(depends_on)
         self._nocache = nocache
 
-    def __str__(self):
-        return (
-            f"{self.__class__.__name__}(schema={self.schema}, desc={self._desc}, "
-            f"filter={str(self._filter)}, udf={str(self._udf)}, agg_func={str(self._agg_func)}, limit={str(self._limit)}, "
-            f"project_cols={str(self._project_cols)}, uid={self.universal_identifier()})"
-        )
-
     @property
     def schema(self) -> Schema:
         return self._schema
 
+    def _set_data_source(self, source: DataReader):
+        if isinstance(self._source, Set):
+            self._source._set_data_source(source)
+        else:
+            self._source = source
+
     def serialize(self):
         # NOTE: I needed to remove depends_on from the serialization dictionary because
         # the optimizer changes the name of the depends_on fields to be their "full" name.
         # This created an issue with the node.universal_identifier() not being consistent
         # after changing the field to its full name.
         d = {
-            "version": Set.SET_VERSION,
             "schema": self.schema.json_schema(),
             "source": self._source.serialize(),
             "desc": repr(self._desc),
@@ -104,6 +90,7 @@ class Set:
             "group_by": (None if self._group_by is None else self._group_by.serialize()),
             "project_cols": (None if self._project_cols is None else self._project_cols),
             "index": None if self._index is None else get_index_str(self._index),
+            "search_func": None if self._search_func is None else str(self._search_func),
             "search_attr": self._search_attr,
             "output_attr": self._output_attr,
             "k": self._k,
@@ -113,72 +100,56 @@ class Set:
 
     def universal_identifier(self):
         """Return a unique identifier for this Set."""
-
-        ordered = json.dumps(d, sort_keys=True)
-        result = hash_for_id(ordered)
-        return result
+        return hash_for_serialized_dict(self.serialize())
 
     def json_schema(self):
         """Return the JSON schema for this Set."""
         return self.schema.json_schema()
 
 
-
 class Dataset(Set):
     """
-    A Dataset is the intended abstraction for programmers to interact with when
-
-    Users instantiate a Dataset by specifying a `source` that either points to a
-
-
-
-    a
-    lineage of computation on a Dataset, and this enables programmers to re-use
-    previously cached computation by providing it as a `source` to some future Dataset.
+    A Dataset is the intended abstraction for programmers to interact with when writing PZ programs.
+
+    Users instantiate a Dataset by specifying a `source` that either points to a DataReader
+    or an existing Dataset. Users can then perform computations on the Dataset in a lazy fashion
+    by leveraging functions such as `filter`, `sem_filter`, `sem_add_columns`, `aggregate`, etc.
+    Underneath the hood, each of these operations creates a new Dataset. As a result, the Dataset
+    defines a lineage of computation.
     """
 
-    def __init__(
-
-
-
-
+    def __init__(
+        self,
+        source: str | Path | list | pd.DataFrame | DataReader | Dataset,
+        schema: Schema | None = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        # NOTE: this function currently assumes that DataReader will always be provided with a schema;
+        # we will relax this assumption in a subsequent PR
+        # convert source into a DataReader
+        updated_source = get_local_datareader(source, **kwargs) if isinstance(source, (str, Path, list, pd.DataFrame)) else source
+
+        # get the schema
+        schema = updated_source.schema if schema is None else schema
+
         # intialize class
         super().__init__(updated_source, schema, *args, **kwargs)
 
-    def copy(self) -> Dataset:
-        source_copy = self._source.copy()
-        dataset_copy = Dataset(
-            schema=self.schema,
-            source=source_copy,
-            desc=self._desc,
-            filter=self._filter,
-            udf=self._udf,
-            agg_func=self._agg_func,
-            group_by=self._group_by,
-            index=self._index,
-            search_attr=self._search_attr,
-            output_attr=self._output_attr,
-            k=self._k,
-            limit=self._limit,
-            cardinality=self._cardinality,
-            depends_on=self._depends_on,
-            nocache=self._nocache,
-        )
-        return dataset_copy
-
     def filter(
         self,
-        _filter:
+        _filter: Callable,
         depends_on: str | list[str] | None = None,
     ) -> Dataset:
-        """Add a filter to the Set. This filter will possibly restrict the items that are returned later."""
+        """Add a user defined function as a filter to the Set. This filter will possibly restrict the items that are returned later."""
         f = None
-        if
-            f = Filter(_filter)
-        elif callable(_filter):
+        if callable(_filter):
            f = Filter(filter_fn=_filter)
         else:
-
+            error_str = f"Only support callable for filter, currently got {type(_filter)}"
+            if isinstance(_filter, str):
+                error_str += ". Consider using sem_filter() for semantic filters."
+            raise Exception(error_str)
 
         if isinstance(depends_on, str):
             depends_on = [depends_on]
@@ -190,33 +161,115 @@ class Dataset(Set):
             depends_on=depends_on,
             nocache=self._nocache,
         )
-
-    def
+
+    def sem_filter(
         self,
-
-        udf: Callable | None = None,
-        cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+        _filter: str,
         depends_on: str | list[str] | None = None,
-        desc: str = "Convert to new schema",
     ) -> Dataset:
-        """
+        """Add a natural language description of a filter to the Set. This filter will possibly restrict the items that are returned later."""
+        f = None
+        if isinstance(_filter, str):
+            f = Filter(_filter)
+        else:
+            raise Exception("sem_filter() only supports `str` input for _filter.", type(_filter))
+
         if isinstance(depends_on, str):
             depends_on = [depends_on]
 
         return Dataset(
             source=self,
-            schema=
-
+            schema=self.schema,
+            filter=f,
+            depends_on=depends_on,
+            nocache=self._nocache,
+        )
+
+    def sem_add_columns(self, cols: list[dict] | type[Schema],
+                        cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                        depends_on: str | list[str] | None = None,
+                        desc: str = "Add new columns via semantic reasoning") -> Dataset:
+        """
+        Add new columns by specifying the column names, descriptions, and types.
+        The column will be computed during the execution of the Dataset.
+        Example:
+            sem_add_columns(
+                [{'name': 'greeting', 'desc': 'The greeting message', 'type': str},
+                 {'name': 'age', 'desc': 'The age of the person', 'type': int},
+                 {'name': 'full_name', 'desc': 'The name of the person', 'type': str}]
+            )
+        """
+        if isinstance(depends_on, str):
+            depends_on = [depends_on]
+
+        new_output_schema = None
+        if isinstance(cols, list):
+            new_output_schema = self.schema.add_fields(cols)
+        elif issubclass(cols, Schema):
+            new_output_schema = self.schema.union(cols)
+        else:
+            raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+        return Dataset(
+            source=self,
+            schema=new_output_schema,
+            udf=None,
             cardinality=cardinality,
             depends_on=depends_on,
             desc=desc,
             nocache=self._nocache,
         )
-
-
-
-
-
+
+    def add_columns(self, udf: Callable,
+                    cols: list[dict] | type[Schema],
+                    cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                    depends_on: str | list[str] | None = None,
+                    desc: str = "Add new columns via UDF") -> Dataset:
+        """
+        Add new columns by specifying UDFs.
+
+        Examples:
+            add_columns(
+                udf=compute_personal_greeting,
+                cols=[
+                    {'name': 'greeting', 'desc': 'The greeting message', 'type': str},
+                    {'name': 'age', 'desc': 'The age of the person', 'type': int},
+                    {'name': 'full_name', 'desc': 'The name of the person', 'type': str},
+                ]
+            )
+        """
+        if udf is None or cols is None:
+            raise ValueError("`udf` and `cols` must be provided for add_columns.")
+
+        if isinstance(depends_on, str):
+            depends_on = [depends_on]
+
+        new_output_schema = None
+        if isinstance(cols, list):
+            updated_cols = []
+            for col_dict in cols:
+                assert isinstance(col_dict, dict), "each entry in `cols` must be a dictionary"
+                assert "name" in col_dict, "each type must contain a 'name' key specifying the column name"
+                assert "type" in col_dict, "each type must contain a 'type' key specifying the column type"
+                col_dict["desc"] = col_dict.get("desc", "New column: " + col_dict["name"])
+                updated_cols.append(col_dict)
+            new_output_schema = self.schema.add_fields(updated_cols)
+
+        elif issubclass(cols, Schema):
+            new_output_schema = self.schema.union(cols)
+
+        else:
+            raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+        return Dataset(
+            source=self,
+            schema=new_output_schema,
+            udf=udf,
+            cardinality=cardinality,
+            desc=desc,
+            depends_on=depends_on,
+            nocache=self._nocache,
+        )
 
     def count(self) -> Dataset:
         """Apply a count aggregation to this set"""
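As an illustration of the new Dataset API introduced in the hunk above, here is a minimal usage sketch; the source path, filter text, and column definitions are hypothetical:

    from palimpzest.sets import Dataset

    # a local directory of text files (illustrative path); Dataset() converts it into a DataReader
    ds = Dataset("testdata/enron-tiny")

    # natural-language predicates go through sem_filter(); Python callables go through filter()
    ds = ds.sem_filter("The email discusses a quarterly meeting")

    # compute new columns semantically; each dict supplies 'name', 'desc', and 'type'
    ds = ds.sem_add_columns([
        {"name": "sender", "desc": "The email address of the sender", "type": str},
        {"name": "subject", "desc": "The subject line of the email", "type": str},
    ])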
@@ -247,12 +300,27 @@ class Dataset(Set):
             nocache=self._nocache,
         )
 
-    def retrieve(
+    def retrieve(
+        self, index, search_func: Callable, search_attr: str, output_attr: str, output_attr_desc: str, k=-1
+    ) -> Dataset:
+        """
+        Retrieve the top k nearest neighbors of the value of the `search_attr` from the index and
+        stores it in the `output_attr` field. The output schema is a union of the current schema
+        and the `output_attr` with type ListField(StringField). `search_func` is a function of
+        type (index, query: str | list(str), k: int) -> list[str]. It should implement the lookup
+        logic for the index and return the top k results. The value of the `search_attr` field is
+        used as the query to lookup in the index. The results are stored in the `output_attr`
+        field. `output_attr_desc` is the description of the `output_attr` field.
+        """
+        # Output schema is a union of the current schema and the output_attr
+        attributes = {output_attr: ListField(StringField)(desc=output_attr_desc)}
+        output_schema = self.schema().union(type("Schema", (Schema,), attributes))
         return Dataset(
             source=self,
             schema=output_schema,
             desc="Retrieve",
             index=index,
+            search_func=search_func,
             search_attr=search_attr,
             output_attr=output_attr,
             k=k,
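The `search_func` contract described in the retrieve() docstring can be satisfied with a small adapter. The sketch below assumes a hypothetical index object exposing a `search(query, k)` method; the actual index interface is not fixed by this diff (see the TODO about abstracting Index):

    def search_func(index, query: str | list[str], k: int) -> list[str]:
        # assumption: `index` exposes search(query, k) returning the top-k matching strings
        queries = [query] if isinstance(query, str) else query
        results = []
        for q in queries:
            results.extend(index.search(q, k))
        return results[:k]

    # hypothetical usage: look up each record's `abstract` and store matches in `related_docs`
    ds = ds.retrieve(
        index=my_index,  # hypothetical index object
        search_func=search_func,
        search_attr="abstract",
        output_attr="related_docs",
        output_attr_desc="Documents related to the abstract",
        k=5,
    )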
@@ -278,6 +346,14 @@ class Dataset(Set):
             nocache=self._nocache,
         )
 
-    def run(self, config: QueryProcessorConfig | None = None, **kwargs):
+    def run(self, config: QueryProcessorConfig | None = None, **kwargs):
+        """Invoke the QueryProcessor to execute the query. `kwargs` will be applied to the QueryProcessorConfig."""
+        # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
         from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
+
+        # as syntactic sugar, we will allow some keyword arguments to parameterize our policies
+        policy = construct_policy_from_kwargs(**kwargs)
+        if policy is not None:
+            kwargs["policy"] = policy
+
         return QueryProcessorFactory.create_and_run_processor(self, config, **kwargs)
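A minimal sketch of executing the query, assuming the dataset `ds` built above. Extra keyword arguments are applied to the QueryProcessorConfig, and any policy-related keywords recognized by construct_policy_from_kwargs (its accepted keywords are not shown in this diff) are translated into a `policy` entry:

    from palimpzest.query.processor.config import QueryProcessorConfig

    # run with default settings; passing a config object is optional (config defaults to None)
    output = ds.run()

    # or, assuming QueryProcessorConfig can be constructed with defaults, pass it explicitly
    output = ds.run(QueryProcessorConfig())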
palimpzest/tools/pdfparser.py
CHANGED
@@ -3,7 +3,7 @@ import io
 import json
 import os
 import time
-from typing import BinaryIO
+from typing import BinaryIO
 from zipfile import ZipFile
 
 import pandas as pd
@@ -11,32 +11,9 @@ import requests
 from fastapi import status
 from pypdf import PdfReader
 
-from palimpzest.config import Config
-
 COSMOS_ADDRESS = "https://xdd.wisc.edu/cosmos_service"
 
 
-class PdfParser:
-    def __init__(self, pdf_path: str):
-        self.pdf_path = pdf_path
-        with open(pdf_path, "rb") as f:
-            self.pdf = f.read()
-        self.text = ""
-        self.pages = []
-        self._parse()
-
-    def _parse(self):
-        for page in self.pdf:
-            self.text += page.get_text()  # type: ignore
-            self.pages.append(page.get_text())  # type: ignore
-
-    def get_text(self) -> str:
-        return self.text
-
-    def get_pages(self) -> List[str]:
-        return self.pages
-
-
 def get_md5(file_bytes: bytes) -> str:
     if not isinstance(file_bytes, bytes):
         file_bytes = file_bytes.encode()
@@ -209,15 +186,9 @@ def cosmos_client(name: str, data: BinaryIO, output_dir: str, delay=10):
 # 1. Check if the text file already exists in the cache, if so, read from the cache
 # 2. If not, call the cosmos_client function to process the PDF file and cache the text file
 ##
-# NOTE: I don't believe anyone actively depends on this function, but we need to remove the
-# dependency on DataDirectory() in order to prevent circular imports. The long-term solution
-# is to separate out the pieces of DataDirectory which the DataSources depend on, from the
-# pieces which are related to setting / reading external configurations (like "pdfprocessor").
-# However, given that I can fix this in two minutes by adding this is a kwarg, I'm going to
-# do that for now and revisit the issue if/when this matters.
 
 # TODO(Jun): 1. cosmos returns 202 for me. 2. why only accept "pypdf" and "cosmos" as pdfprocessor?
-def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="
+def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="pypdf", enable_file_cache=True, file_cache_dir="/tmp"):
     pdf_filename = filename
     file_name = os.path.basename(pdf_filename)
     file_name_without_extension = os.path.splitext(file_name)[0]
@@ -229,11 +200,12 @@ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_ca
         for page in pdf.pages:
             all_text += page.extract_text() + "\n"
         return all_text
-
+
     else:
         # Get md5 of the pdf_bytes
         md5 = get_md5(pdf_bytes)
         cached_extraction_folder = f"COSMOS_{os.path.splitext(file_name)[0].replace(' ', '_')}_{md5}"
+
         # Check if pz_file_cache_dir exists in the file system
         pz_file_cache_dir = os.path.join(file_cache_dir, cached_extraction_folder)
         if enable_file_cache and os.path.exists(pz_file_cache_dir):
@@ -243,43 +215,12 @@ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_ca
                 text_content = file.read()
                 return text_content
 
-        #
-        # CHUNWEI: This code has a bug
-        # It checks to see if the text file name is in the registry, but there are two things wrong here.
-        # 1) The registry is for 'official' datasets that have been inserted by the user, not cached objects.
-        # 2) The filename isn't enough to check for cached results. Maybe the file moved directories, or maybe there are
-        #    multiple different files with the same name. You need the checksum of the original file to ensure the cached
-        #    object is valid.
-        #
-        # if DataDirectory().exists(text_file_name):
-        #     print(f"Text file {text_file_name} already exists, reading from cache")
-        #     text_file_path = DataDirectory().get_path(text_file_name)
-        #     with open(text_file_path, 'r') as file:
-        #         text_content = file.read()
-        #         return text_content
-        # cosmos_file_dir = file_name_without_extension.replace(" ", "_")
-        # get a tmp of the system temp directory
-
-        print(f"Processing {file_name} through COSMOS")
         # Call the cosmos_client function
+        print(f"Processing {file_name} through COSMOS")
         cosmos_client(file_name, pdf_bytes, file_cache_dir)
         text_file_path = os.path.join(pz_file_cache_dir, text_file_name)
         if not os.path.exists(text_file_path):
             raise FileNotFoundError(f"Text file {text_file_name} not found in {pz_file_cache_dir}/{text_file_name}")
-        # DataDirectory().register_local_file(text_file_path, text_file_name)
         with open(text_file_path) as file:
             text_content = file.read()
             return text_content
-
-
-if __name__ == "__main__":
-    config = Config("default")
-    file_path = "../../../testdata/pdfs-tiny/battery.pdf"
-    # output_dir = "../../../tests/testFileDirectory/cosmos"
-    with open(file_path, "rb") as file:
-        text = get_text_from_pdf(file_path, file.read())
-        print(text)
-    # file_name = os.path.basename(file_path)
-    # # Call the cosmos_client function
-    # cosmos_client(file_name, file, output_dir)
-    # DataDirectory().rm_registered_dataset("sidarthe.annotations.txt")
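With the default processor now set to "pypdf", PDF text extraction no longer requires the COSMOS service. A short sketch, using a hypothetical local file:

    from palimpzest.tools.pdfparser import get_text_from_pdf

    with open("paper.pdf", "rb") as f:  # hypothetical local PDF
        pdf_bytes = f.read()

    # pdfprocessor defaults to "pypdf"; pass pdfprocessor="cosmos" to use the COSMOS service instead
    text = get_text_from_pdf("paper.pdf", pdf_bytes)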
palimpzest/utils/datareader_helpers.py
ADDED
@@ -0,0 +1,61 @@
+import os
+from pathlib import Path
+
+import pandas as pd
+
+from palimpzest import constants
+from palimpzest.core.data.datareaders import (
+    DataReader,
+    FileReader,
+    HTMLFileDirectoryReader,
+    ImageFileDirectoryReader,
+    MemoryReader,
+    PDFFileDirectoryReader,
+    TextFileDirectoryReader,
+    XLSFileDirectoryReader,
+)
+
+
+def get_local_source(path: str | Path, **kwargs) -> DataReader:
+    """Return a DataReader for a local file or directory."""
+    if os.path.isfile(path):
+        return FileReader(path)
+
+    elif os.path.isdir(path):
+        if all([f.endswith(tuple(constants.IMAGE_EXTENSIONS)) for f in os.listdir(path)]):
+            return ImageFileDirectoryReader(path)
+
+        elif all([f.endswith(tuple(constants.PDF_EXTENSIONS)) for f in os.listdir(path)]):
+            pdfprocessor = kwargs.get("pdfprocessor", constants.DEFAULT_PDF_PROCESSOR)
+            file_cache_dir = kwargs.get("file_cache_dir", "/tmp")
+            return PDFFileDirectoryReader(
+                path=path, pdfprocessor=pdfprocessor, file_cache_dir=file_cache_dir
+            )
+
+        elif all([f.endswith(tuple(constants.XLS_EXTENSIONS)) for f in os.listdir(path)]):
+            return XLSFileDirectoryReader(path)
+
+        elif all([f.endswith(tuple(constants.HTML_EXTENSIONS)) for f in os.listdir(path)]):
+            return HTMLFileDirectoryReader(path)
+
+        else:
+            return TextFileDirectoryReader(path)
+    else:
+        raise Exception(f"Path {path} is invalid. Does not point to a file or directory.")
+
+
+def get_local_datareader(source: str | Path | list | pd.DataFrame, **kwargs) -> DataReader:
+    """
+    This helper function returns a `DataReader` object based on the `source` type.
+    The returned `DataReader` object is guaranteed to have a schema.
+    """
+    if isinstance(source, (str, Path)):
+        source = get_local_source(source, **kwargs)
+
+    elif isinstance(source, (list, pd.DataFrame)):
+        source = MemoryReader(source)
+
+    else:
+        raise Exception(f"Invalid source type: {type(source)}, We only support str, Path, list[dict], and pd.DataFrame")
+
+    return source
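A sketch of how the new helper routes different source types; the directory path is hypothetical. Directories are matched against the extension constants to pick a directory reader, while in-memory data becomes a MemoryReader:

    import pandas as pd

    from palimpzest.utils.datareader_helpers import get_local_datareader

    # a directory containing only .pdf files (hypothetical path) -> PDFFileDirectoryReader
    pdf_reader = get_local_datareader("testdata/pdfs-tiny", pdfprocessor="pypdf")

    # a pandas DataFrame -> MemoryReader
    df_reader = get_local_datareader(pd.DataFrame({"name": ["Alice", "Bob"], "age": [30, 25]}))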
palimpzest/utils/field_helpers.py
ADDED
@@ -0,0 +1,69 @@
+import types
+
+from palimpzest.core.lib.fields import (
+    BooleanField,
+    BytesField,
+    Field,
+    FloatField,
+    IntField,
+    ListField,
+    NumericField,
+    StringField,
+)
+
+
+def assert_valid_field_type(field_type: type | types.UnionType | types.GenericAlias | Field) -> str:
+    """
+    Assert that the field is a valid field type. Return "pz_type" if field_type is a PZ type
+    and "python_type" if it is a Python type.
+    """
+    try:
+        assert issubclass(field_type, Field), "type must be a Python type or palimpzest.core.lib.fields.Field"
+        return "pz_type"
+    except Exception:
+        assert isinstance(field_type, (type, types.UnionType, types.GenericAlias)), "type must be a Python type or palimpzest.core.lib.fields.Field"
+
+    return "python_type"
+
+
+def construct_field_type(field_type: type | types.UnionType | types.GenericAlias | Field, desc: str) -> Field:
+    """Convert a field type and description to the corresponding PZ field.
+
+    Args:
+        type: type for the field (e.g. str, bool, list[int], StringField, etc.)
+        desc: description used in the field constructor
+
+    Returns:
+        Corresponding Field class
+
+    Raises:
+        ValueError: If the type is not recognized
+    """
+    # if field_type is a PZ type, construct and return the field
+    if assert_valid_field_type(field_type) == "pz_type":
+        return field_type(desc=desc)
+
+    # otherwise, map the Python type to a PZ type and construct the field
+    supported_types_map = {
+        str: StringField,
+        bool: BooleanField,
+        int: IntField,
+        float: FloatField,
+        int | float: NumericField,
+        bytes: BytesField,
+        list[str]: ListField(StringField),
+        list[bool]: ListField(BooleanField),
+        list[int]: ListField(IntField),
+        list[float]: ListField(FloatField),
+        list[int | float]: ListField(NumericField),
+        list[bytes]: ListField(BytesField),
+    }
+
+    if field_type not in supported_types_map:
+        raise ValueError(f"Unsupported type: {field_type}. Supported types are: {list(supported_types_map.keys())}")
+
+    # get the field class and (if applicable) element field class
+    field_cls = supported_types_map[field_type]
+
+    # construct and return the field
+    return field_cls(desc=desc)
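A short sketch of the type mapping in construct_field_type, covering a plain Python type, a parameterized list type, and an existing PZ field type; the descriptions are illustrative:

    from palimpzest.core.lib.fields import StringField
    from palimpzest.utils.field_helpers import construct_field_type

    # Python types are mapped through supported_types_map
    age_field = construct_field_type(int, desc="The age of the person")         # IntField
    tags_field = construct_field_type(list[str], desc="Tags for the document")  # ListField(StringField)

    # PZ field types are constructed directly
    name_field = construct_field_type(StringField, desc="The name of the person")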
palimpzest/utils/hash_helpers.py
CHANGED
@@ -1,4 +1,5 @@
 import hashlib
+import json
 
 from palimpzest.constants import MAX_ID_CHARS
 
@@ -7,5 +8,5 @@ def hash_for_id(id_str: str, max_chars: int = MAX_ID_CHARS) -> str:
     return hashlib.sha256(id_str.encode("utf-8")).hexdigest()[:max_chars]
 
 
-def
-    return hash_for_id(
+def hash_for_serialized_dict(dict_obj: dict) -> str:
+    return hash_for_id(json.dumps(dict_obj, sort_keys=True))