palimpzest 0.5.4__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.0.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
palimpzest/sets.py CHANGED
@@ -1,18 +1,20 @@
  from __future__ import annotations
 
- import json
+ from pathlib import Path
  from typing import Callable
 
  import pandas as pd
 
  from palimpzest.constants import AggFunc, Cardinality
- from palimpzest.core.data.datasources import DataSource
+ from palimpzest.core.data.datareaders import DataReader
  from palimpzest.core.elements.filters import Filter
  from palimpzest.core.elements.groupbysig import GroupBySig
- from palimpzest.core.lib.schemas import DefaultSchema, Number, Schema
- from palimpzest.datamanager.datamanager import DataDirectory
+ from palimpzest.core.lib.fields import ListField, StringField
+ from palimpzest.core.lib.schemas import Number, Schema
+ from palimpzest.policy import construct_policy_from_kwargs
  from palimpzest.query.processor.config import QueryProcessorConfig
- from palimpzest.utils.hash_helpers import hash_for_id
+ from palimpzest.utils.datareader_helpers import get_local_datareader
+ from palimpzest.utils.hash_helpers import hash_for_serialized_dict
  from palimpzest.utils.index_helpers import get_index_str
 
 
@@ -21,27 +23,11 @@ from palimpzest.utils.index_helpers import get_index_str
  #####################################################
  class Set:
      """
-     A Set is the logical abstraction for a set of DataRecords matching some Schema. It is
-     also a node in the computation graph of a Dataset.
-
-     Each Dataset consists of one or more Sets. The "initial" Set in a Dataset can be thought
-     of as the Set that results from reading each DataRecord unaltered from the source. For each
-     filter or transformation that is applied to the Dataset, a new Set is created which defines
-     the set of DataRecords that result from applying that filter or transformation. In brief,
-     the Sets define a Dataset's computation graph. Sets can also be cached to maximize the reuse
-     of past computation.
-
-     Sets are initialized with a dataset_id, a schema, and a source. The source is either an
-     existing Set or a raw data source (such as a directory or S3 prefix). Sets may be initialized
-     with a Filter (which defines the filtering performed on the source to obtain *this* Set),
-     and a description of what this Set is meant to represent.
      """
 
-     SET_VERSION = 0.1
-
      def __init__(
          self,
-         source: Set | DataSource,
+         source: Set | DataReader,
          schema: Schema,
          desc: str | None = None,
          filter: Filter | None = None,
@@ -49,10 +35,11 @@ class Set:
          agg_func: AggFunc | None = None,
          group_by: GroupBySig | None = None,
          project_cols: list[str] | None = None,
-         index = None, # TODO(Siva): Abstract Index and add a type here and elsewhere
+         index=None, # TODO(Siva): Abstract Index and add a type here and elsewhere
+         search_func: Callable | None = None,
          search_attr: str | None = None,
          output_attr: str | None = None,
-         k: int | None = None, # TODO: disambiguate `k` to be something like `retrieve_k`
+         k: int | None = None,  # TODO: disambiguate `k` to be something like `retrieve_k`
          limit: int | None = None,
          cardinality: Cardinality = Cardinality.ONE_TO_ONE,
          depends_on: list[str] | None = None,
@@ -67,6 +54,7 @@ class Set:
          self._group_by = group_by
          self._project_cols = None if project_cols is None else sorted(project_cols)
          self._index = index
+         self._search_func = search_func
          self._search_attr = search_attr
          self._output_attr = output_attr
          self._k = k
@@ -75,24 +63,22 @@ class Set:
          self._depends_on = [] if depends_on is None else sorted(depends_on)
          self._nocache = nocache
 
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}(schema={self.schema}, desc={self._desc}, "
-             f"filter={str(self._filter)}, udf={str(self._udf)}, agg_func={str(self._agg_func)}, limit={str(self._limit)}, "
-             f"project_cols={str(self._project_cols)}, uid={self.universal_identifier()})"
-         )
-
      @property
      def schema(self) -> Schema:
          return self._schema
 
+     def _set_data_source(self, source: DataReader):
+         if isinstance(self._source, Set):
+             self._source._set_data_source(source)
+         else:
+             self._source = source
+
      def serialize(self):
          # NOTE: I needed to remove depends_on from the serialization dictionary because
          # the optimizer changes the name of the depends_on fields to be their "full" name.
          # This created an issue with the node.universal_identifier() not being consistent
          # after changing the field to its full name.
          d = {
-             "version": Set.SET_VERSION,
              "schema": self.schema.json_schema(),
              "source": self._source.serialize(),
              "desc": repr(self._desc),
@@ -104,6 +90,7 @@ class Set:
              "group_by": (None if self._group_by is None else self._group_by.serialize()),
              "project_cols": (None if self._project_cols is None else self._project_cols),
              "index": None if self._index is None else get_index_str(self._index),
+             "search_func": None if self._search_func is None else str(self._search_func),
              "search_attr": self._search_attr,
              "output_attr": self._output_attr,
              "k": self._k,
@@ -113,72 +100,56 @@ class Set:
 
      def universal_identifier(self):
          """Return a unique identifier for this Set."""
-         d = self.serialize()
-         ordered = json.dumps(d, sort_keys=True)
-         result = hash_for_id(ordered)
-         return result
+         return hash_for_serialized_dict(self.serialize())
 
      def json_schema(self):
          """Return the JSON schema for this Set."""
          return self.schema.json_schema()
 
 
-
  class Dataset(Set):
      """
-     A Dataset is the intended abstraction for programmers to interact with when manipulating Sets.
-
-     Users instantiate a Dataset by specifying a `source` that either points to a
-     DataSource or an existing cached Set. Users can then perform computations on
-     the Dataset in an imperative fashion by leveraging functions such as `filter`,
-     `convert`, `aggregate`, etc. Underneath the hood, each of these operations creates
-     a new Set which is cached by the DataManager. As a result, the Sets define the
-     lineage of computation on a Dataset, and this enables programmers to re-use
-     previously cached computation by providing it as a `source` to some future Dataset.
+     A Dataset is the intended abstraction for programmers to interact with when writing PZ programs.
+
+     Users instantiate a Dataset by specifying a `source` that either points to a DataReader
+     or an existing Dataset. Users can then perform computations on the Dataset in a lazy fashion
+     by leveraging functions such as `filter`, `sem_filter`, `sem_add_columns`, `aggregate`, etc.
+     Underneath the hood, each of these operations creates a new Dataset. As a result, the Dataset
+     defines a lineage of computation.
      """
 
-     def __init__(self, source: str | list | pd.DataFrame | DataSource, schema: Schema | None = None, *args, **kwargs):
-         # convert source (str) -> source (DataSource) if need be
-         updated_source = DataDirectory().get_or_register_dataset(source) if isinstance(source, (str, list, pd.DataFrame)) else source
-         if schema is None:
-             schema = Schema.from_df(source) if isinstance(source, pd.DataFrame) else DefaultSchema
+     def __init__(
+         self,
+         source: str | Path | list | pd.DataFrame | DataReader | Dataset,
+         schema: Schema | None = None,
+         *args,
+         **kwargs,
+     ) -> None:
+         # NOTE: this function currently assumes that DataReader will always be provided with a schema;
+         # we will relax this assumption in a subsequent PR
+         # convert source into a DataReader
+         updated_source = get_local_datareader(source, **kwargs) if isinstance(source, (str, Path, list, pd.DataFrame)) else source
+
+         # get the schema
+         schema = updated_source.schema if schema is None else schema
+
          # intialize class
          super().__init__(updated_source, schema, *args, **kwargs)
 
-     def copy(self) -> Dataset:
-         source_copy = self._source.copy()
-         dataset_copy = Dataset(
-             schema=self.schema,
-             source=source_copy,
-             desc=self._desc,
-             filter=self._filter,
-             udf=self._udf,
-             agg_func=self._agg_func,
-             group_by=self._group_by,
-             index=self._index,
-             search_attr=self._search_attr,
-             output_attr=self._output_attr,
-             k=self._k,
-             limit=self._limit,
-             cardinality=self._cardinality,
-             depends_on=self._depends_on,
-             nocache=self._nocache,
-         )
-         return dataset_copy
-
      def filter(
          self,
-         _filter: str | Callable,
+         _filter: Callable,
          depends_on: str | list[str] | None = None,
      ) -> Dataset:
-         """Add a filter to the Set. This filter will possibly restrict the items that are returned later."""
+         """Add a user defined function as a filter to the Set. This filter will possibly restrict the items that are returned later."""
          f = None
-         if isinstance(_filter, str):
-             f = Filter(_filter)
-         elif callable(_filter):
+         if callable(_filter):
              f = Filter(filter_fn=_filter)
          else:
-             raise Exception("Filter type not supported.", type(_filter))
+             error_str = f"Only support callable for filter, currently got {type(_filter)}"
+             if isinstance(_filter, str):
+                 error_str += ". Consider using sem_filter() for semantic filters."
+             raise Exception(error_str)
 
          if isinstance(depends_on, str):
              depends_on = [depends_on]
@@ -190,33 +161,115 @@ class Dataset(Set):
              depends_on=depends_on,
              nocache=self._nocache,
          )
-
-     def convert(
+
+     def sem_filter(
          self,
-         output_schema: Schema,
-         udf: Callable | None = None,
-         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+         _filter: str,
          depends_on: str | list[str] | None = None,
-         desc: str = "Convert to new schema",
      ) -> Dataset:
-         """Convert the Set to a new schema."""
+         """Add a natural language description of a filter to the Set. This filter will possibly restrict the items that are returned later."""
+         f = None
+         if isinstance(_filter, str):
+             f = Filter(_filter)
+         else:
+             raise Exception("sem_filter() only supports `str` input for _filter.", type(_filter))
+
          if isinstance(depends_on, str):
              depends_on = [depends_on]
 
          return Dataset(
              source=self,
-             schema=output_schema,
-             udf=udf,
+             schema=self.schema,
+             filter=f,
+             depends_on=depends_on,
+             nocache=self._nocache,
+         )
+
+     def sem_add_columns(self, cols: list[dict] | type[Schema],
+                         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                         depends_on: str | list[str] | None = None,
+                         desc: str = "Add new columns via semantic reasoning") -> Dataset:
+         """
+         Add new columns by specifying the column names, descriptions, and types.
+         The column will be computed during the execution of the Dataset.
+         Example:
+             sem_add_columns(
+                 [{'name': 'greeting', 'desc': 'The greeting message', 'type': str},
+                  {'name': 'age', 'desc': 'The age of the person', 'type': int},
+                  {'name': 'full_name', 'desc': 'The name of the person', 'type': str}]
+             )
+         """
+         if isinstance(depends_on, str):
+             depends_on = [depends_on]
+
+         new_output_schema = None
+         if isinstance(cols, list):
+             new_output_schema = self.schema.add_fields(cols)
+         elif issubclass(cols, Schema):
+             new_output_schema = self.schema.union(cols)
+         else:
+             raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+         return Dataset(
+             source=self,
+             schema=new_output_schema,
+             udf=None,
              cardinality=cardinality,
              depends_on=depends_on,
              desc=desc,
              nocache=self._nocache,
          )
-
-     # This is a convenience for users who like DataFrames-like syntax.
-     def add_columns(self, columns:dict[str, str], cardinality: Cardinality = Cardinality.ONE_TO_ONE) -> Dataset:
-         new_output_schema = self.schema.add_fields(columns)
-         return self.convert(new_output_schema, udf=None, cardinality=cardinality, depends_on=None, desc="Add columns " + str(columns))
+
+     def add_columns(self, udf: Callable,
+                     cols: list[dict] | type[Schema],
+                     cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                     depends_on: str | list[str] | None = None,
+                     desc: str = "Add new columns via UDF") -> Dataset:
+         """
+         Add new columns by specifying UDFs.
+
+         Examples:
+             add_columns(
+                 udf=compute_personal_greeting,
+                 cols=[
+                     {'name': 'greeting', 'desc': 'The greeting message', 'type': str},
+                     {'name': 'age', 'desc': 'The age of the person', 'type': int},
+                     {'name': 'full_name', 'desc': 'The name of the person', 'type': str},
+                 ]
+             )
+         """
+         if udf is None or cols is None:
+             raise ValueError("`udf` and `cols` must be provided for add_columns.")
+
+         if isinstance(depends_on, str):
+             depends_on = [depends_on]
+
+         new_output_schema = None
+         if isinstance(cols, list):
+             updated_cols = []
+             for col_dict in cols:
+                 assert isinstance(col_dict, dict), "each entry in `cols` must be a dictionary"
+                 assert "name" in col_dict, "each type must contain a 'name' key specifying the column name"
+                 assert "type" in col_dict, "each type must contain a 'type' key specifying the column type"
+                 col_dict["desc"] = col_dict.get("desc", "New column: " + col_dict["name"])
+                 updated_cols.append(col_dict)
+             new_output_schema = self.schema.add_fields(updated_cols)
+
+         elif issubclass(cols, Schema):
+             new_output_schema = self.schema.union(cols)
+
+         else:
+             raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+         return Dataset(
+             source=self,
+             schema=new_output_schema,
+             udf=udf,
+             cardinality=cardinality,
+             desc=desc,
+             depends_on=depends_on,
+             nocache=self._nocache,
+         )
 
      def count(self) -> Dataset:
          """Apply a count aggregation to this set"""
@@ -247,12 +300,27 @@ class Dataset(Set):
              nocache=self._nocache,
          )
 
-     def retrieve(self, output_schema, index, search_attr, output_attr, k=-1) -> Dataset:
+     def retrieve(
+         self, index, search_func: Callable, search_attr: str, output_attr: str, output_attr_desc: str, k=-1
+     ) -> Dataset:
+         """
+         Retrieve the top k nearest neighbors of the value of the `search_attr` from the index and
+         stores it in the `output_attr` field. The output schema is a union of the current schema
+         and the `output_attr` with type ListField(StringField). `search_func` is a function of
+         type (index, query: str | list(str), k: int) -> list[str]. It should implement the lookup
+         logic for the index and return the top k results. The value of the `search_attr` field is
+         used as the query to lookup in the index. The results are stored in the `output_attr`
+         field. `output_attr_desc` is the description of the `output_attr` field.
+         """
+         # Output schema is a union of the current schema and the output_attr
+         attributes = {output_attr: ListField(StringField)(desc=output_attr_desc)}
+         output_schema = self.schema().union(type("Schema", (Schema,), attributes))
          return Dataset(
              source=self,
              schema=output_schema,
              desc="Retrieve",
              index=index,
+             search_func=search_func,
              search_attr=search_attr,
              output_attr=output_attr,
              k=k,
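Note that the new `retrieve()` contract above leaves the index lookup entirely to the caller: `search_func` must accept `(index, query, k)` and return a list of strings. A minimal sketch of such a function, assuming a hypothetical index object with a `search(query, top_k)` method (that API is not part of this diff):

def my_search_func(index, query: str | list[str], k: int) -> list[str]:
    # normalize the query to a list of strings
    queries = [query] if isinstance(query, str) else query
    results = []
    for q in queries:
        # hypothetical index API; assumed to return objects with a .text attribute
        results.extend(hit.text for hit in index.search(q, top_k=k))
    return results[:k]

# hypothetical usage: the column names and description below are illustrative only
# ds = ds.retrieve(index=my_index, search_func=my_search_func, search_attr="claim",
#                  output_attr="evidence", output_attr_desc="Passages supporting the claim", k=5)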
@@ -278,6 +346,14 @@ class Dataset(Set):
              nocache=self._nocache,
          )
 
-     def run(self, config: QueryProcessorConfig | None = None, **kwargs): # noqa: F821
+     def run(self, config: QueryProcessorConfig | None = None, **kwargs):
+         """Invoke the QueryProcessor to execute the query. `kwargs` will be applied to the QueryProcessorConfig."""
+         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
          from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
+
+         # as syntactic sugar, we will allow some keyword arguments to parameterize our policies
+         policy = construct_policy_from_kwargs(**kwargs)
+         if policy is not None:
+             kwargs["policy"] = policy
+
          return QueryProcessorFactory.create_and_run_processor(self, config, **kwargs)
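Taken together, the sets.py changes split the old overloaded `filter()`/`convert()` API into `filter()`/`sem_filter()` and `add_columns()`/`sem_add_columns()`, and fold policy construction into `run()`. A minimal sketch of a 0.6.0-style program; the data path, column definitions, filter predicates, and record attribute access are illustrative assumptions rather than taken from this diff:

from palimpzest.sets import Dataset

# a string/Path source is converted into a DataReader via get_local_datareader()
ds = Dataset("testdata/emails")  # hypothetical directory of text files

# semantic column extraction: each dict supplies a name, description, and Python type
ds = ds.sem_add_columns([
    {"name": "sender", "desc": "The email address of the sender", "type": str},
    {"name": "subject", "desc": "The subject line of the email", "type": str},
])

# natural-language filter vs. plain-callable filter
ds = ds.sem_filter("The email discusses a quarterly earnings report")
ds = ds.filter(lambda record: record.subject is not None)  # assumes attribute-style record access

# run() builds a QueryProcessor; policy-related kwargs are folded in via construct_policy_from_kwargs()
output = ds.run()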
palimpzest/tools/pdfparser.py CHANGED
@@ -3,7 +3,7 @@ import io
  import json
  import os
  import time
- from typing import BinaryIO, List
+ from typing import BinaryIO
  from zipfile import ZipFile
 
  import pandas as pd
@@ -11,32 +11,9 @@ import requests
  from fastapi import status
  from pypdf import PdfReader
 
- from palimpzest.config import Config
-
  COSMOS_ADDRESS = "https://xdd.wisc.edu/cosmos_service"
 
 
- class PdfParser:
-     def __init__(self, pdf_path: str):
-         self.pdf_path = pdf_path
-         with open(pdf_path, "rb") as f:
-             self.pdf = f.read()
-         self.text = ""
-         self.pages = []
-         self._parse()
-
-     def _parse(self):
-         for page in self.pdf:
-             self.text += page.get_text() # type: ignore
-             self.pages.append(page.get_text()) # type: ignore
-
-     def get_text(self) -> str:
-         return self.text
-
-     def get_pages(self) -> List[str]:
-         return self.pages
-
-
  def get_md5(file_bytes: bytes) -> str:
      if not isinstance(file_bytes, bytes):
          file_bytes = file_bytes.encode()
@@ -209,15 +186,9 @@ def cosmos_client(name: str, data: BinaryIO, output_dir: str, delay=10):
  # 1. Check if the text file already exists in the cache, if so, read from the cache
  # 2. If not, call the cosmos_client function to process the PDF file and cache the text file
  ##
- # NOTE: I don't believe anyone actively depends on this function, but we need to remove the
- # dependency on DataDirectory() in order to prevent circular imports. The long-term solution
- # is to separate out the pieces of DataDirectory which the DataSources depend on, from the
- # pieces which are related to setting / reading external configurations (like "pdfprocessor").
- # However, given that I can fix this in two minutes by adding this is a kwarg, I'm going to
- # do that for now and revisit the issue if/when this matters.
 
  # TODO(Jun): 1. cosmos returns 202 for me. 2. why only accept "pypdf" and "cosmos" as pdfprocessor?
- def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_cache=True, file_cache_dir="/tmp"):
+ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="pypdf", enable_file_cache=True, file_cache_dir="/tmp"):
      pdf_filename = filename
      file_name = os.path.basename(pdf_filename)
      file_name_without_extension = os.path.splitext(file_name)[0]
@@ -229,11 +200,12 @@ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_ca
          for page in pdf.pages:
              all_text += page.extract_text() + "\n"
          return all_text
-         # return pdf.pages[0].extract_text() # TODO we can only return first page
+
      else:
          # Get md5 of the pdf_bytes
          md5 = get_md5(pdf_bytes)
          cached_extraction_folder = f"COSMOS_{os.path.splitext(file_name)[0].replace(' ', '_')}_{md5}"
+
          # Check if pz_file_cache_dir exists in the file system
          pz_file_cache_dir = os.path.join(file_cache_dir, cached_extraction_folder)
          if enable_file_cache and os.path.exists(pz_file_cache_dir):
@@ -243,43 +215,12 @@ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_ca
                  text_content = file.read()
              return text_content
 
-         #
-         # CHUNWEI: This code has a bug
-         # It checks to see if the text file name is in the registry, but there are two things wrong here.
-         # 1) The registry is for 'official' datasets that have been inserted by the user, not cached objects.
-         # 2) The filename isn't enough to check for cached results. Maybe the file moved directories, or maybe there are
-         # multiple different files with the same name. You need the checksum of the original file to ensure the cached
-         # object is valid.
-         #
-         # if DataDirectory().exists(text_file_name):
-         # print(f"Text file {text_file_name} already exists, reading from cache")
-         # text_file_path = DataDirectory().get_path(text_file_name)
-         # with open(text_file_path, 'r') as file:
-         # text_content = file.read()
-         # return text_content
-         # cosmos_file_dir = file_name_without_extension.replace(" ", "_")
-         # get a tmp of the system temp directory
-
-         print(f"Processing {file_name} through COSMOS")
          # Call the cosmos_client function
+         print(f"Processing {file_name} through COSMOS")
          cosmos_client(file_name, pdf_bytes, file_cache_dir)
          text_file_path = os.path.join(pz_file_cache_dir, text_file_name)
          if not os.path.exists(text_file_path):
              raise FileNotFoundError(f"Text file {text_file_name} not found in {pz_file_cache_dir}/{text_file_name}")
-         # DataDirectory().register_local_file(text_file_path, text_file_name)
          with open(text_file_path) as file:
              text_content = file.read()
          return text_content
-
-
- if __name__ == "__main__":
-     config = Config("default")
-     file_path = "../../../testdata/pdfs-tiny/battery.pdf"
-     # output_dir = "../../../tests/testFileDirectory/cosmos"
-     with open(file_path, "rb") as file:
-         text = get_text_from_pdf(file_path, file.read())
-         print(text)
-     # file_name = os.path.basename(file_path)
-     # # Call the cosmos_client function
-     # cosmos_client(file_name, file, output_dir)
-     # DataDirectory().rm_registered_dataset("sidarthe.annotations.txt")
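Besides deleting the unused PdfParser class and the __main__ scratch code, the key behavioral change here is that get_text_from_pdf() now defaults to pdfprocessor="pypdf", so PDF text extraction no longer calls the COSMOS web service unless explicitly requested. A usage sketch; the file path reuses the example path from the removed __main__ block:

from palimpzest.tools.pdfparser import get_text_from_pdf

pdf_path = "testdata/pdfs-tiny/battery.pdf"
with open(pdf_path, "rb") as f:
    # with the new default this extracts text locally via pypdf;
    # pass pdfprocessor="cosmos" to restore the old web-service behavior
    text = get_text_from_pdf(pdf_path, f.read())
print(text[:500])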
palimpzest/utils/datareader_helpers.py ADDED
@@ -0,0 +1,61 @@
+ import os
+ from pathlib import Path
+
+ import pandas as pd
+
+ from palimpzest import constants
+ from palimpzest.core.data.datareaders import (
+     DataReader,
+     FileReader,
+     HTMLFileDirectoryReader,
+     ImageFileDirectoryReader,
+     MemoryReader,
+     PDFFileDirectoryReader,
+     TextFileDirectoryReader,
+     XLSFileDirectoryReader,
+ )
+
+
+ def get_local_source(path: str | Path, **kwargs) -> DataReader:
+     """Return a DataReader for a local file or directory."""
+     if os.path.isfile(path):
+         return FileReader(path)
+
+     elif os.path.isdir(path):
+         if all([f.endswith(tuple(constants.IMAGE_EXTENSIONS)) for f in os.listdir(path)]):
+             return ImageFileDirectoryReader(path)
+
+         elif all([f.endswith(tuple(constants.PDF_EXTENSIONS)) for f in os.listdir(path)]):
+             pdfprocessor = kwargs.get("pdfprocessor", constants.DEFAULT_PDF_PROCESSOR)
+             file_cache_dir = kwargs.get("file_cache_dir", "/tmp")
+             return PDFFileDirectoryReader(
+                 path=path, pdfprocessor=pdfprocessor, file_cache_dir=file_cache_dir
+             )
+
+         elif all([f.endswith(tuple(constants.XLS_EXTENSIONS)) for f in os.listdir(path)]):
+             return XLSFileDirectoryReader(path)
+
+         elif all([f.endswith(tuple(constants.HTML_EXTENSIONS)) for f in os.listdir(path)]):
+             return HTMLFileDirectoryReader(path)
+
+         else:
+             return TextFileDirectoryReader(path)
+     else:
+         raise Exception(f"Path {path} is invalid. Does not point to a file or directory.")
+
+
+ def get_local_datareader(source: str | Path | list | pd.DataFrame, **kwargs) -> DataReader:
+     """
+     This helper function returns a `DataReader` object based on the `source` type.
+     The returned `DataReader` object is guaranteed to have a schema.
+     """
+     if isinstance(source, (str, Path)):
+         source = get_local_source(source, **kwargs)
+
+     elif isinstance(source, (list, pd.DataFrame)):
+         source = MemoryReader(source)
+
+     else:
+         raise Exception(f"Invalid source type: {type(source)}, We only support str, Path, list[dict], and pd.DataFrame")
+
+     return source
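This helper is what Dataset.__init__ now calls for raw sources: strings and Paths go through get_local_source(), which picks a directory reader by file extension, while lists and DataFrames become a MemoryReader. A short sketch of the two paths; the directory path is hypothetical:

import pandas as pd
from palimpzest.utils.datareader_helpers import get_local_datareader

# a directory of .txt files falls through to TextFileDirectoryReader
dir_reader = get_local_datareader("testdata/enron-tiny")

# an in-memory DataFrame becomes a MemoryReader
df = pd.DataFrame({"name": ["Alice", "Bob"], "age": [34, 28]})
df_reader = get_local_datareader(df)

# both satisfy the DataReader interface that Dataset expects as a source
print(type(dir_reader).__name__, type(df_reader).__name__)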
palimpzest/utils/field_helpers.py ADDED
@@ -0,0 +1,69 @@
+ import types
+
+ from palimpzest.core.lib.fields import (
+     BooleanField,
+     BytesField,
+     Field,
+     FloatField,
+     IntField,
+     ListField,
+     NumericField,
+     StringField,
+ )
+
+
+ def assert_valid_field_type(field_type: type | types.UnionType | types.GenericAlias | Field) -> str:
+     """
+     Assert that the field is a valid field type. Return "pz_type" if field_type is a PZ type
+     and "python_type" if it is a Python type.
+     """
+     try:
+         assert issubclass(field_type, Field), "type must be a Python type or palimpzest.core.lib.fields.Field"
+         return "pz_type"
+     except Exception:
+         assert isinstance(field_type, (type, types.UnionType, types.GenericAlias)), "type must be a Python type or palimpzest.core.lib.fields.Field"
+
+     return "python_type"
+
+
+ def construct_field_type(field_type: type | types.UnionType | types.GenericAlias | Field, desc: str) -> Field:
+     """Convert a field type and description to the corresponding PZ field.
+
+     Args:
+         type: type for the field (e.g. str, bool, list[int], StringField, etc.)
+         desc: description used in the field constructor
+
+     Returns:
+         Corresponding Field class
+
+     Raises:
+         ValueError: If the type is not recognized
+     """
+     # if field_type is a PZ type, construct and return the field
+     if assert_valid_field_type(field_type) == "pz_type":
+         return field_type(desc=desc)
+
+     # otherwise, map the Python type to a PZ type and construct the field
+     supported_types_map = {
+         str: StringField,
+         bool: BooleanField,
+         int: IntField,
+         float: FloatField,
+         int | float: NumericField,
+         bytes: BytesField,
+         list[str]: ListField(StringField),
+         list[bool]: ListField(BooleanField),
+         list[int]: ListField(IntField),
+         list[float]: ListField(FloatField),
+         list[int | float]: ListField(NumericField),
+         list[bytes]: ListField(BytesField),
+     }
+
+     if field_type not in supported_types_map:
+         raise ValueError(f"Unsupported type: {field_type}. Supported types are: {list(supported_types_map.keys())}")
+
+     # get the field class and (if applicable) element field class
+     field_cls = supported_types_map[field_type]
+
+     # construct and return the field
+     return field_cls(desc=desc)
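construct_field_type() is the bridge that lets sem_add_columns() / add_columns() accept plain Python types in their `cols` dictionaries. A short sketch of how it resolves types, based on the mapping above:

from palimpzest.core.lib.fields import StringField
from palimpzest.utils.field_helpers import construct_field_type

# Python types are mapped through supported_types_map ...
age_field = construct_field_type(int, desc="The age of the person")
names_field = construct_field_type(list[str], desc="All names mentioned in the text")

# ... while a PZ Field subclass is instantiated directly
title_field = construct_field_type(StringField, desc="The title of the paper")

# anything outside the mapping raises ValueError
try:
    construct_field_type(dict, desc="not supported")
except ValueError as err:
    print(err)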
palimpzest/utils/hash_helpers.py CHANGED
@@ -1,4 +1,5 @@
  import hashlib
+ import json
 
  from palimpzest.constants import MAX_ID_CHARS
 
@@ -7,5 +8,5 @@ def hash_for_id(id_str: str, max_chars: int = MAX_ID_CHARS) -> str:
      return hashlib.sha256(id_str.encode("utf-8")).hexdigest()[:max_chars]
 
 
- def hash_for_temp_schema(id_str:str) ->str:
-     return hash_for_id(id_str)
+ def hash_for_serialized_dict(dict_obj: dict) -> str:
+     return hash_for_id(json.dumps(dict_obj, sort_keys=True))
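With Set.universal_identifier() now delegating to this helper, a node's identifier is simply a truncated SHA-256 of its key-sorted JSON serialization. A small illustration of the property that matters here (key order does not affect the hash):

from palimpzest.utils.hash_helpers import hash_for_serialized_dict

a = {"schema": "Email", "limit": 10}
b = {"limit": 10, "schema": "Email"}

# json.dumps(..., sort_keys=True) makes the serialization, and therefore the hash, order-insensitive
assert hash_for_serialized_dict(a) == hash_for_serialized_dict(b)
print(hash_for_serialized_dict(a))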