palimpzest 0.7.21__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +343 -209
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +639 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +62 -6
  19. palimpzest/prompts/filter_prompts.py +51 -6
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +6 -6
  22. palimpzest/prompts/prompt_factory.py +375 -47
  23. palimpzest/prompts/split_proposer_prompts.py +1 -1
  24. palimpzest/prompts/util_phrases.py +5 -0
  25. palimpzest/prompts/validator.py +239 -0
  26. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  27. palimpzest/query/execution/execution_strategy.py +210 -317
  28. palimpzest/query/execution/execution_strategy_type.py +5 -7
  29. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  30. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  31. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  32. palimpzest/query/generators/generators.py +160 -331
  33. palimpzest/query/operators/__init__.py +15 -5
  34. palimpzest/query/operators/aggregate.py +50 -33
  35. palimpzest/query/operators/compute.py +201 -0
  36. palimpzest/query/operators/convert.py +33 -19
  37. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  38. palimpzest/query/operators/distinct.py +62 -0
  39. palimpzest/query/operators/filter.py +26 -16
  40. palimpzest/query/operators/join.py +403 -0
  41. palimpzest/query/operators/limit.py +3 -3
  42. palimpzest/query/operators/logical.py +205 -77
  43. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  44. palimpzest/query/operators/physical.py +27 -21
  45. palimpzest/query/operators/project.py +3 -3
  46. palimpzest/query/operators/rag_convert.py +7 -7
  47. palimpzest/query/operators/retrieve.py +9 -9
  48. palimpzest/query/operators/scan.py +81 -42
  49. palimpzest/query/operators/search.py +524 -0
  50. palimpzest/query/operators/split_convert.py +10 -8
  51. palimpzest/query/optimizer/__init__.py +7 -9
  52. palimpzest/query/optimizer/cost_model.py +108 -441
  53. palimpzest/query/optimizer/optimizer.py +123 -181
  54. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  55. palimpzest/query/optimizer/plan.py +352 -67
  56. palimpzest/query/optimizer/primitives.py +43 -19
  57. palimpzest/query/optimizer/rules.py +484 -646
  58. palimpzest/query/optimizer/tasks.py +127 -58
  59. palimpzest/query/processor/config.py +42 -76
  60. palimpzest/query/processor/query_processor.py +73 -18
  61. palimpzest/query/processor/query_processor_factory.py +46 -38
  62. palimpzest/schemabuilder/schema_builder.py +15 -28
  63. palimpzest/utils/model_helpers.py +32 -77
  64. palimpzest/utils/progress.py +114 -102
  65. palimpzest/validator/__init__.py +0 -0
  66. palimpzest/validator/validator.py +306 -0
  67. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/METADATA +6 -1
  68. palimpzest-0.8.1.dist-info/RECORD +95 -0
  69. palimpzest/core/lib/fields.py +0 -141
  70. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  71. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  72. palimpzest/query/generators/api_client_factory.py +0 -30
  73. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  74. palimpzest/query/operators/map.py +0 -130
  75. palimpzest/query/processor/nosentinel_processor.py +0 -33
  76. palimpzest/query/processor/processing_strategy_type.py +0 -28
  77. palimpzest/query/processor/sentinel_processor.py +0 -88
  78. palimpzest/query/processor/streaming_processor.py +0 -149
  79. palimpzest/sets.py +0 -405
  80. palimpzest/utils/datareader_helpers.py +0 -61
  81. palimpzest/utils/demo_helpers.py +0 -75
  82. palimpzest/utils/field_helpers.py +0 -69
  83. palimpzest/utils/generation_helpers.py +0 -69
  84. palimpzest/utils/sandbox.py +0 -183
  85. palimpzest-0.7.21.dist-info/RECORD +0 -95
  86. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  87. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/WHEEL +0 -0
  88. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/licenses/LICENSE +0 -0
  89. {palimpzest-0.7.21.dist-info → palimpzest-0.8.1.dist-info}/top_level.txt +0 -0
@@ -4,69 +4,62 @@ import base64
4
4
  import os
5
5
  from abc import ABC, abstractmethod
6
6
  from io import BytesIO
7
+ from pathlib import Path
7
8
 
8
9
  import pandas as pd
9
10
  from bs4 import BeautifulSoup
11
+ from pydantic import BaseModel
10
12
 
11
13
  from palimpzest import constants
14
+ from palimpzest.core.data import dataset
12
15
  from palimpzest.core.lib.schemas import (
16
+ AudioFile,
13
17
  DefaultSchema,
14
- File,
15
18
  ImageFile,
16
19
  PDFFile,
17
- Schema,
18
20
  TextFile,
19
21
  WebPage,
20
22
  XLSFile,
23
+ create_schema_from_df,
24
+ create_schema_from_fields,
21
25
  )
26
+ from palimpzest.query.operators.logical import BaseScan
22
27
  from palimpzest.tools.pdfparser import get_text_from_pdf
23
28
 
24
29
 
25
- # First level of abstraction
26
- class DataReader(ABC):
30
+ ####################
31
+ ### BASE CLASSES ###
32
+ ####################
33
+ class IterDataset(dataset.Dataset, ABC):
27
34
  """
28
- The `DataReader` is a base class for which may be used to generate data that
29
- is processed by PZ.
35
+ The `IterDataset` is an abstract base class for root `Datasets` whose data is accessed
36
+ via iteration. Classes which inherit from this class must implement two methods:
30
37
 
31
- Subclasses of the (abstract) `DataReader` class must implement two methods:
32
-
33
- - `__len__()`: which returns the number of elements in the data source
38
+ - `__len__()`: which returns the number of elements in the dataset
34
39
  - `__getitem__(idx: int)`: which takes in an `idx` and returns the element at that index
35
40
  """
36
41
 
37
- def __init__(self, schema: type[Schema] | list[dict]) -> None:
42
+ def __init__(self, id: str, schema: type[BaseModel] | list[dict]) -> None:
38
43
  """
39
- Constructor for the `DataReader` class.
44
+ Constructor for the `IterDataset` class.
40
45
 
41
46
  Args:
42
- schema (Schema | list[dict]): The output schema of the records returned by the DataReader
47
+ id (str): a string identifier for the `Dataset`
48
+ schema (BaseModel | list[dict]): The output schema of the records returned by the `Dataset`
43
49
  """
44
- # NOTE: _schema attribute currently has to match attribute name in Dataset
45
- self._schema = Schema.from_json(schema) if isinstance(schema, list) else schema
46
-
47
- def __eq__(self, __value: object) -> bool:
48
- return self.__dict__ == __value.__dict__
49
-
50
- def __str__(self) -> str:
51
- return f"{self.__class__.__name__}(schema={self.schema})"
52
-
53
- @property
54
- def schema(self) -> Schema:
55
- return self._schema
56
-
57
- # NOTE: currently used by optimizer to compute node id for DataReaders
58
- def serialize(self) -> dict:
59
- return {"schema": self._schema.json_schema()}
50
+ # compute Schema and call parent constructor
51
+ schema = create_schema_from_fields(schema) if isinstance(schema, list) else schema
52
+ super().__init__(sources=None, operator=BaseScan(datasource=self, output_schema=schema), schema=schema, id=id)
60
53
 
61
54
  @abstractmethod
62
55
  def __len__(self) -> int:
63
- """Returns the number of items in the data reader."""
56
+ """Returns the number of items in the `Dataset`."""
64
57
  pass
65
58
 
66
59
  @abstractmethod
67
60
  def __getitem__(self, idx: int) -> dict:
68
61
  """
69
- Returns a single item from the data reader at the given index.
62
+ Returns a single item from the `Dataset` at the given index.
70
63
 
71
64
  Args:
72
65
  idx (int): The index of the item to return
@@ -74,7 +67,7 @@ class DataReader(ABC):
74
67
  Returns:
75
68
  dict: A dictionary representing the item at the given index. The dictionary
76
69
  keys (i.e. fields) should match the fields specified in the schema of the
77
- data source, and the values should be the values associated with those fields.
70
+ dataset, and the values should be the values associated with those fields.
78
71
 
79
72
  # Example return value
80
73
  {"field1": value1, "field2": value2, ...}
@@ -83,111 +76,103 @@ class DataReader(ABC):
83
76
  pass
84
77
 
85
78
 
86
- # Second level of abstraction
87
- class DirectoryReader(DataReader):
79
+ class BaseFileDataset(IterDataset):
88
80
  """
89
- DirectoryReader returns a dictionary for each file in a directory. Each dictionary contains the filename and
90
- contents of a single file in the directory.
81
+ BaseFileDataset is the base class for multiple `IterDatasets` which iterate over
82
+ different types of files.
91
83
  """
92
84
 
93
- def __init__(self, path: str, schema: Schema) -> None:
85
+ def __init__(self, path: str, **kwargs) -> None:
94
86
  """
95
- Constructor for the `DirectoryReader` class.
87
+ Constructor for the `BaseFileDataset` class.
96
88
 
97
89
  Args:
98
- path (str): The path to the directory
99
- schema (Schema): The output schema of the data source
90
+ path (str): The path to the file or directory
91
+ kwargs (dict): Keyword arguments containing the `Dataset's` id and file-specific `Schema`
100
92
  """
101
- assert os.path.isdir(path), f"Path {path} is not a directory"
93
+ # check that path is a valid file or directory
94
+ assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
102
95
 
103
- self.filepaths = [
104
- os.path.join(path, filename)
105
- for filename in sorted(os.listdir(path))
106
- if os.path.isfile(os.path.join(path, filename))
107
- ]
108
- self.path = path
109
- super().__init__(schema)
96
+ # get list of filepaths
97
+ self.filepaths = []
98
+ if os.path.isfile(path):
99
+ self.filepaths = [path]
100
+ else:
101
+ self.filepaths = [
102
+ os.path.join(path, filename)
103
+ for filename in sorted(os.listdir(path))
104
+ if os.path.isfile(os.path.join(path, filename))
105
+ ]
110
106
 
111
- def serialize(self) -> dict:
112
- return {
113
- "schema": self.schema.json_schema(),
114
- "path": self.path,
115
- "source_type": "directory",
116
- }
107
+ # call parent constructor to set id, operator, and schema
108
+ super().__init__(**kwargs)
117
109
 
118
110
  def __len__(self) -> int:
119
111
  return len(self.filepaths)
120
112
 
121
113
 
122
- class FileReader(DataReader):
123
- """FileReader returns a single dictionary with the filename and contents of a local file (in bytes)."""
114
+ class BaseFileDirectoryDataset(IterDataset):
115
+ """
116
+ BaseFileDirectoryDataset is the base class for multiple `IterDatasets` which iterate over
117
+ different types of files. This class walks the entire directory tree rooted at `path`.
118
+ """
124
119
 
125
- def __init__(self, path: str) -> None:
120
+ def __init__(self, path: str, **kwargs) -> None:
126
121
  """
127
- Constructor for the `FileReader` class. The `schema` is set to the default `File` schema.
122
+ Constructor for the `BaseFileDirectoryDataset` class.
128
123
 
129
124
  Args:
130
125
  path (str): The path to the file
131
- """
132
- super().__init__(File)
133
- self.filepath = path
134
-
135
- def serialize(self) -> dict:
136
- return {
137
- "schema": self.schema.json_schema(),
138
- "path": self.filepath,
139
- "source_type": "file",
140
- }
126
+ kwargs (dict): Keyword arguments containing the `Dataset's` id and file-specific `Schema`
127
+ """
128
+ # check that path is a valid file or directory
129
+ assert os.path.isfile(path) or os.path.isdir(path), f"Path {path} is not a file nor a directory"
130
+
131
+ # get list of filepaths
132
+ self.filepaths = []
133
+ if os.path.isfile(path):
134
+ self.filepaths = [path]
135
+ else:
136
+ self.filepaths = []
137
+ for root, _, files in os.walk(path):
138
+ for file in files:
139
+ fp = os.path.join(root, file)
140
+ self.filepaths.append(fp)
141
+ self.filepaths = sorted(self.filepaths)
142
+
143
+ # call parent constructor to set id, operator, and schema
144
+ super().__init__(**kwargs)
141
145
 
142
146
  def __len__(self) -> int:
143
- return 1
144
-
145
- def __getitem__(self, idx: int) -> dict:
146
- """
147
- Returns a dictionary with the filename and contents of the file.
148
-
149
- Args:
150
- idx (int): The index of the item to return. This argument is ignored.
151
-
152
- Returns:
153
- dict: A dictionary with the filename and contents of the file.
154
-
155
- .. code-block:: python
156
-
157
- {
158
- "filename": "path/to/file.txt",
159
- "contents": b"file contents here",
160
- }
161
- """
162
- filename = self.filepath
163
- with open(self.filepath, "rb") as f:
164
- contents = f.read()
165
-
166
- return {"filename": filename, "contents": contents}
167
-
147
+ return len(self.filepaths)
168
148
 
169
- class MemoryReader(DataReader):
149
+ ########################
150
+ ### CONCRETE CLASSES ###
151
+ ########################
152
+ class MemoryDataset(IterDataset):
170
153
  """
171
- MemoryReader returns one or more dictionaries that reflect the contents of an in-memory Python object `vals`.
154
+ MemoryDataset returns one or more dictionaries that reflect the contents of an in-memory Python object `vals`.
172
155
  If `vals` is not a pd.DataFrame, then the dictionary returned by `__getitem__()` has a single field called "value".
173
156
  Otherwise, the dictionary contains the key-value mapping from columns to values for the `idx` row in the dataframe.
174
157
 
175
158
  TODO(gerardo): Add support for other types of in-memory data structures (he has some code for subclassing
176
- MemoryReader on his branch)
159
+ MemoryDataset on his branch)
177
160
  """
178
161
 
179
- def __init__(self, vals: list | pd.DataFrame) -> None:
162
+ def __init__(self, id: str, vals: list | pd.DataFrame, schema: type[BaseModel] | list[dict] | None = None) -> None:
180
163
  """
181
- Constructor for the `MemoryReader` class. The `schema` is set to the default `DefaultSchema` schema.
164
+ Constructor for the `MemoryDataset` class. The `schema` is set to the default `DefaultSchema` schema.
182
165
  If `vals` is a pd.DataFrame, then the schema is set to the schema inferred from the DataFrame.
183
166
 
184
167
  Args:
185
- vals (Any): The in-memory object to use as the data source
168
+ id (str): a string identifier for the `Dataset`
169
+ vals (Any): The in-memory data to iterate over
186
170
  """
187
171
  # if list[dict] --> convert to pd.DataFrame first
188
172
  self.vals = pd.DataFrame(vals) if isinstance(vals, list) and all([isinstance(item, dict) for item in vals]) else vals
189
- schema = Schema.from_df(self.vals) if isinstance(self.vals, pd.DataFrame) else DefaultSchema
190
- super().__init__(schema)
173
+ if schema is None:
174
+ schema = create_schema_from_df(self.vals) if isinstance(self.vals, pd.DataFrame) else DefaultSchema
175
+ super().__init__(id=id, schema=schema)
191
176
 
192
177
  def __len__(self) -> int:
193
178
  return len(self.vals)
@@ -228,20 +213,20 @@ class MemoryReader(DataReader):
228
213
  return item
229
214
 
230
215
 
231
- # Third level of abstraction
232
- class HTMLFileDirectoryReader(DirectoryReader):
216
+ class HTMLFileDataset(BaseFileDataset):
233
217
  """
234
- HTMLFileDirectoryReader returns a dictionary for each HTML file in a directory. Each dictionary contains the
218
+ HTMLFileDataset returns a dictionary for each HTML file in a directory. Each dictionary contains the
235
219
  filename, raw HTML content, and parsed content of a single HTML file in the directory.
236
220
  """
237
- def __init__(self, path: str) -> None:
221
+ def __init__(self, id: str, path: str) -> None:
238
222
  """
239
- Constructor for the `HTMLFileDirectoryReader` class. The `schema` is set to the `WebPage` schema.
223
+ Constructor for the `HTMLFileDataset` class. The `schema` is set to the `WebPage` schema.
240
224
 
241
225
  Args:
226
+ id (str): a string identifier for the `Dataset`
242
227
  path (str): The path to the directory
243
228
  """
244
- super().__init__(path=path, schema=WebPage)
229
+ super().__init__(path=path, id=id, schema=WebPage)
245
230
  assert all([filename.endswith(tuple(constants.HTML_EXTENSIONS)) for filename in self.filepaths])
246
231
 
247
232
  def _html_to_text_with_links(self, html: str) -> str:
@@ -296,19 +281,20 @@ class HTMLFileDirectoryReader(DirectoryReader):
296
281
  return item
297
282
 
298
283
 
299
- class ImageFileDirectoryReader(DirectoryReader):
284
+ class ImageFileDataset(BaseFileDataset):
300
285
  """
301
- ImageFileDirectoryReader returns a dictionary for each image file in a directory. Each dictionary contains the
286
+ ImageFileDataset returns a dictionary for each image file in a directory. Each dictionary contains the
302
287
  filename and the base64 encoded bytes content of a single image file in the directory.
303
288
  """
304
- def __init__(self, path: str) -> None:
289
+ def __init__(self, id: str, path: str) -> None:
305
290
  """
306
- Constructor for the `ImageFileDirectoryReader` class. The `schema` is set to the `ImageFile` schema.
291
+ Constructor for the `ImageFileDataset` class. The `schema` is set to the `ImageFile` schema.
307
292
 
308
293
  Args:
294
+ id (str): a string identifier for the `Dataset`
309
295
  path (str): The path to the directory
310
296
  """
311
- super().__init__(path=path, schema=ImageFile)
297
+ super().__init__(path=path, id=id, schema=ImageFile)
312
298
  assert all([filename.endswith(tuple(constants.IMAGE_EXTENSIONS)) for filename in self.filepaths])
313
299
 
314
300
  def __getitem__(self, idx: int) -> dict:
@@ -332,33 +318,35 @@ class ImageFileDirectoryReader(DirectoryReader):
332
318
  filepath = self.filepaths[idx]
333
319
  filename = os.path.basename(filepath)
334
320
  with open(filepath, "rb") as f:
335
- contents = base64.b64encode(f.read())
321
+ contents = base64.b64encode(f.read()).decode("utf-8")
336
322
 
337
323
  return {"filename": filename, "contents": contents}
338
324
 
339
325
 
340
- class PDFFileDirectoryReader(DirectoryReader):
326
+ class PDFFileDataset(BaseFileDataset):
341
327
  """
342
- PDFFileDirectoryReader returns a dictionary for each PDF file in a directory. Each dictionary contains the
328
+ PDFFileDataset returns a dictionary for each PDF file in a directory. Each dictionary contains the
343
329
  filename, raw PDF content, and parsed text content of a single PDF file in the directory.
344
330
 
345
331
  This class also uses one of a predefined set of PDF processors to extract text content from the PDF files.
346
332
  """
347
333
  def __init__(
348
334
  self,
335
+ id: str,
349
336
  path: str,
350
337
  pdfprocessor: str = "pypdf",
351
338
  file_cache_dir: str = "/tmp",
352
339
  ) -> None:
353
340
  """
354
- Constructor for the `PDFFileDirectoryReader` class. The `schema` is set to the `PDFFile` schema.
341
+ Constructor for the `PDFFileDataset` class. The `schema` is set to the `PDFFile` schema.
355
342
 
356
343
  Args:
344
+ id (str): a string identifier for the `Dataset`
357
345
  path (str): The path to the directory
358
346
  pdfprocessor (str): The PDF processor to use for extracting text content from the PDF files
359
347
  file_cache_dir (str): The directory to store the temporary files generated during PDF processing
360
348
  """
361
- super().__init__(path=path, schema=PDFFile)
349
+ super().__init__(path=path, id=id, schema=PDFFile)
362
350
  assert all([filename.endswith(tuple(constants.PDF_EXTENSIONS)) for filename in self.filepaths])
363
351
  self.pdfprocessor = pdfprocessor
364
352
  self.file_cache_dir = file_cache_dir
@@ -394,19 +382,20 @@ class PDFFileDirectoryReader(DirectoryReader):
394
382
  return {"filename": pdf_filename, "contents": pdf_bytes, "text_contents": text_content}
395
383
 
396
384
 
397
- class TextFileDirectoryReader(DirectoryReader):
385
+ class TextFileDataset(BaseFileDataset):
398
386
  """
399
- TextFileDirectoryReader returns a dictionary for each text file in a directory. Each dictionary contains the
387
+ TextFileDataset returns a dictionary for each text file in a directory. Each dictionary contains the
400
388
  filename and contents of a single text file in the directory.
401
389
  """
402
- def __init__(self, path: str) -> None:
390
+ def __init__(self, id: str, path: str) -> None:
403
391
  """
404
- Constructor for the `TextFileDirectoryReader` class. The `schema` is set to the `TextFile` schema.
392
+ Constructor for the `TextFileDataset` class. The `schema` is set to the `TextFile` schema.
405
393
 
406
394
  Args:
395
+ id (str): a string identifier for the `Dataset`
407
396
  path (str): The path to the directory
408
397
  """
409
- super().__init__(path=path, schema=TextFile)
398
+ super().__init__(path=path, id=id, schema=TextFile)
410
399
 
411
400
  def __getitem__(self, idx: int) -> dict:
412
401
  """
@@ -433,16 +422,16 @@ class TextFileDirectoryReader(DirectoryReader):
433
422
  return {"filename": filename, "contents": contents}
434
423
 
435
424
 
436
- class XLSFileDirectoryReader(DirectoryReader):
425
+ class XLSFileDataset(BaseFileDataset):
437
426
  """
438
- XLSFileDirectoryReader returns a dictionary for each XLS file in a directory. Each dictionary contains the
427
+ XLSFileDataset returns a dictionary for each XLS file in a directory. Each dictionary contains the
439
428
  filename, contents, sheet names, and the number of sheets for a single XLS file in the directory.
440
429
  """
441
- def __init__(self, path: str) -> None:
430
+ def __init__(self, id: str, path: str) -> None:
442
431
  """
443
- Constructor for the `XLSFileDirectoryReader` class. The `schema` is set to the `XLSFile` schema.
432
+ Constructor for the `XLSFileDataset` class. The `schema` is set to the `XLSFile` schema.
444
433
  """
445
- super().__init__(path=path, schema=XLSFile)
434
+ super().__init__(path=path, id=id, schema=XLSFile)
446
435
  assert all([filename.endswith(tuple(constants.XLS_EXTENSIONS)) for filename in self.filepaths])
447
436
 
448
437
  def __getitem__(self, idx: int) -> dict:
@@ -478,3 +467,90 @@ class XLSFileDirectoryReader(DirectoryReader):
478
467
  "sheet_names": xls.sheet_names,
479
468
  "number_sheets": len(xls.sheet_names),
480
469
  }
470
+
471
+
472
+ class AudioFileDataset(BaseFileDirectoryDataset):
473
+ """
474
+ AudioFileDataset returns a dictionary for each audio file in a directory. Each dictionary contains the
475
+ filename and the base64 encoded bytes content of a single audio file in the directory.
476
+ """
477
+ def __init__(self, id: str, path: str) -> None:
478
+ """
479
+ Constructor for the `AudioFileDataset` class. The `schema` is set to the `AudioFile` schema.
480
+
481
+ Args:
482
+ id (str): a string identifier for the `Dataset`
483
+ path (str): The path to the directory
484
+ """
485
+ super().__init__(path=path, id=id, schema=AudioFile)
486
+ assert all([filename.endswith(tuple(constants.AUDIO_EXTENSIONS)) for filename in self.filepaths])
487
+
488
+ def __getitem__(self, idx: int) -> dict:
489
+ """
490
+ Returns a dictionary with the filename and base64 encoded bytes content of the audio file at the
491
+ specified `idx`.
492
+
493
+ Args:
494
+ idx (int): The index of the item to return
495
+
496
+ Returns:
497
+ dict: A dictionary with the filename and base64 encoded bytes content of the audio file.
498
+
499
+ .. code-block:: python
500
+
501
+ {
502
+ "filename": "audio.wav",
503
+ "contents": "base64 encoded audio content here",
504
+ }
505
+ """
506
+ filepath = self.filepaths[idx]
507
+ filename = os.path.basename(filepath)
508
+ with open(filepath, "rb") as f:
509
+ contents = base64.b64encode(f.read()).decode("utf-8")
510
+
511
+ return {"filename": filename, "contents": contents}
512
+
513
+
514
+ def get_local_source(id: str, path: str | Path, **kwargs) -> dataset.Dataset:
515
+ """Return a `Dataset` for a local file or directory."""
516
+ if os.path.isfile(path):
517
+ return TextFileDataset(id, path)
518
+
519
+ elif os.path.isdir(path):
520
+ if all([f.endswith(tuple(constants.IMAGE_EXTENSIONS)) for f in os.listdir(path)]):
521
+ return ImageFileDataset(id, path)
522
+
523
+ elif all([f.endswith(tuple(constants.PDF_EXTENSIONS)) for f in os.listdir(path)]):
524
+ pdfprocessor = kwargs.get("pdfprocessor", constants.DEFAULT_PDF_PROCESSOR)
525
+ file_cache_dir = kwargs.get("file_cache_dir", "/tmp")
526
+ return PDFFileDataset(
527
+ id=id, path=path, pdfprocessor=pdfprocessor, file_cache_dir=file_cache_dir
528
+ )
529
+
530
+ elif all([f.endswith(tuple(constants.XLS_EXTENSIONS)) for f in os.listdir(path)]):
531
+ return XLSFileDataset(id, path)
532
+
533
+ elif all([f.endswith(tuple(constants.HTML_EXTENSIONS)) for f in os.listdir(path)]):
534
+ return HTMLFileDataset(id, path)
535
+
536
+ else:
537
+ return TextFileDataset(id, path)
538
+ else:
539
+ raise ValueError(f"Path {path} is invalid. Does not point to a file or directory.")
540
+
541
+
542
+ def resolve_datasource(id: str, source: str | Path | list | pd.DataFrame, **kwargs) -> dataset.Dataset:
543
+ """
544
+ This helper function returns a `Dataset` object based on the `source` type.
545
+ The returned `Dataset` object is guaranteed to have a schema.
546
+ """
547
+ if isinstance(source, (str, Path)):
548
+ source = get_local_source(id, source, **kwargs)
549
+
550
+ elif isinstance(source, (list, pd.DataFrame)):
551
+ source = MemoryDataset(id=id, vals=source)
552
+
553
+ else:
554
+ raise ValueError(f"Invalid source type: {type(source)}, We only support str, Path, list[dict], and pd.DataFrame")
555
+
556
+ return source
@@ -2,10 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  from typing import Any
4
4
 
5
- from palimpzest.core.lib.fields import Field
6
- from palimpzest.core.lib.schemas import OperatorDerivedSchema, Schema
5
+ from pydantic import BaseModel
7
6
 
7
+ from palimpzest.core.lib.schemas import create_schema_from_fields
8
8
 
9
+
10
+ # TODO: need to rethink how group bys work
9
11
  # signature for a group by aggregate that applies
10
12
  # group and aggregation to an input tuple
11
13
  class GroupBySig:
@@ -14,12 +16,12 @@ class GroupBySig:
14
16
  self.agg_funcs = agg_funcs
15
17
  self.agg_fields = agg_fields
16
18
 
17
- def validate_schema(self, input_schema: Schema) -> tuple[bool, str | None]:
19
+ def validate_schema(self, input_schema: BaseModel) -> tuple[bool, str | None]:
18
20
  for f in self.group_by_fields:
19
- if not hasattr(input_schema, f):
21
+ if f not in input_schema.model_fields:
20
22
  return (False, "Supplied schema has no field " + f)
21
23
  for f in self.agg_fields:
22
- if not hasattr(input_schema, f):
24
+ if f not in input_schema.model_fields:
23
25
  return (False, "Supplied schema has no field " + f)
24
26
  return (True, None)
25
27
 
@@ -48,16 +50,17 @@ class GroupBySig:
48
50
  ops.append(self.agg_funcs[i] + "(" + self.agg_fields[i] + ")")
49
51
  return ops
50
52
 
51
- def output_schema(self) -> type[OperatorDerivedSchema]:
53
+ def output_schema(self) -> type[BaseModel]:
52
54
  # the output class varies depending on the group by, so here
53
55
  # we dynamically construct this output
54
- schema = type("CustomGroupBy", (OperatorDerivedSchema,), {})
55
-
56
+ fields = []
56
57
  for g in self.group_by_fields:
57
- f = Field(desc=g)
58
- setattr(schema, g, f)
58
+ f = {"name": g, "type": Any, "desc": f"Group by field: {g}"}
59
+ fields.append(f)
60
+
59
61
  ops = self.get_agg_field_names()
60
62
  for op in ops:
61
- f = Field(desc=op)
62
- setattr(schema, op, f)
63
- return schema
63
+ f = {"name": op, "type": Any, "desc": f"Aggregate field: {op}"}
64
+ fields.append(f)
65
+
66
+ return create_schema_from_fields(fields)