palimpzest 0.5.4__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.0.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
palimpzest/sets.py CHANGED
@@ -1,18 +1,20 @@
  from __future__ import annotations
 
- import json
+ from pathlib import Path
  from typing import Callable
 
  import pandas as pd
 
  from palimpzest.constants import AggFunc, Cardinality
- from palimpzest.core.data.datasources import DataSource
+ from palimpzest.core.data.datareaders import DataReader
  from palimpzest.core.elements.filters import Filter
  from palimpzest.core.elements.groupbysig import GroupBySig
- from palimpzest.core.lib.schemas import DefaultSchema, Number, Schema
- from palimpzest.datamanager.datamanager import DataDirectory
+ from palimpzest.core.lib.fields import ListField, StringField
+ from palimpzest.core.lib.schemas import Number, Schema
+ from palimpzest.policy import construct_policy_from_kwargs
  from palimpzest.query.processor.config import QueryProcessorConfig
- from palimpzest.utils.hash_helpers import hash_for_id
+ from palimpzest.utils.datareader_helpers import get_local_datareader
+ from palimpzest.utils.hash_helpers import hash_for_serialized_dict
  from palimpzest.utils.index_helpers import get_index_str
 
 
@@ -21,27 +23,11 @@ from palimpzest.utils.index_helpers import get_index_str
  #####################################################
  class Set:
      """
-     A Set is the logical abstraction for a set of DataRecords matching some Schema. It is
-     also a node in the computation graph of a Dataset.
-
-     Each Dataset consists of one or more Sets. The "initial" Set in a Dataset can be thought
-     of as the Set that results from reading each DataRecord unaltered from the source. For each
-     filter or transformation that is applied to the Dataset, a new Set is created which defines
-     the set of DataRecords that result from applying that filter or transformation. In brief,
-     the Sets define a Dataset's computation graph. Sets can also be cached to maximize the reuse
-     of past computation.
-
-     Sets are initialized with a dataset_id, a schema, and a source. The source is either an
-     existing Set or a raw data source (such as a directory or S3 prefix). Sets may be initialized
-     with a Filter (which defines the filtering performed on the source to obtain *this* Set),
-     and a description of what this Set is meant to represent.
      """
 
-     SET_VERSION = 0.1
-
      def __init__(
          self,
-         source: Set | DataSource,
+         source: Set | DataReader,
          schema: Schema,
          desc: str | None = None,
          filter: Filter | None = None,
@@ -49,10 +35,11 @@ class Set:
          agg_func: AggFunc | None = None,
          group_by: GroupBySig | None = None,
          project_cols: list[str] | None = None,
-         index = None, # TODO(Siva): Abstract Index and add a type here and elsewhere
+         index=None, # TODO(Siva): Abstract Index and add a type here and elsewhere
+         search_func: Callable | None = None,
          search_attr: str | None = None,
          output_attr: str | None = None,
-         k: int | None = None, # TODO: disambiguate `k` to be something like `retrieve_k`
+         k: int | None = None,  # TODO: disambiguate `k` to be something like `retrieve_k`
          limit: int | None = None,
          cardinality: Cardinality = Cardinality.ONE_TO_ONE,
          depends_on: list[str] | None = None,
@@ -67,6 +54,7 @@ class Set:
          self._group_by = group_by
          self._project_cols = None if project_cols is None else sorted(project_cols)
          self._index = index
+         self._search_func = search_func
          self._search_attr = search_attr
          self._output_attr = output_attr
          self._k = k
@@ -75,24 +63,22 @@ class Set:
          self._depends_on = [] if depends_on is None else sorted(depends_on)
          self._nocache = nocache
 
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}(schema={self.schema}, desc={self._desc}, "
-             f"filter={str(self._filter)}, udf={str(self._udf)}, agg_func={str(self._agg_func)}, limit={str(self._limit)}, "
-             f"project_cols={str(self._project_cols)}, uid={self.universal_identifier()})"
-         )
-
      @property
      def schema(self) -> Schema:
          return self._schema
 
+     def _set_data_source(self, source: DataReader):
+         if isinstance(self._source, Set):
+             self._source._set_data_source(source)
+         else:
+             self._source = source
+
      def serialize(self):
          # NOTE: I needed to remove depends_on from the serialization dictionary because
          # the optimizer changes the name of the depends_on fields to be their "full" name.
          # This created an issue with the node.universal_identifier() not being consistent
          # after changing the field to its full name.
          d = {
-             "version": Set.SET_VERSION,
              "schema": self.schema.json_schema(),
              "source": self._source.serialize(),
              "desc": repr(self._desc),
@@ -104,6 +90,7 @@ class Set:
              "group_by": (None if self._group_by is None else self._group_by.serialize()),
              "project_cols": (None if self._project_cols is None else self._project_cols),
              "index": None if self._index is None else get_index_str(self._index),
+             "search_func": None if self._search_func is None else str(self._search_func),
              "search_attr": self._search_attr,
              "output_attr": self._output_attr,
              "k": self._k,
@@ -113,72 +100,56 @@ class Set:
 
      def universal_identifier(self):
          """Return a unique identifier for this Set."""
-         d = self.serialize()
-         ordered = json.dumps(d, sort_keys=True)
-         result = hash_for_id(ordered)
-         return result
+         return hash_for_serialized_dict(self.serialize())
 
      def json_schema(self):
          """Return the JSON schema for this Set."""
          return self.schema.json_schema()
 
 
-
  class Dataset(Set):
      """
-     A Dataset is the intended abstraction for programmers to interact with when manipulating Sets.
-
-     Users instantiate a Dataset by specifying a `source` that either points to a
-     DataSource or an existing cached Set. Users can then perform computations on
-     the Dataset in an imperative fashion by leveraging functions such as `filter`,
-     `convert`, `aggregate`, etc. Underneath the hood, each of these operations creates
-     a new Set which is cached by the DataManager. As a result, the Sets define the
-     lineage of computation on a Dataset, and this enables programmers to re-use
-     previously cached computation by providing it as a `source` to some future Dataset.
+     A Dataset is the intended abstraction for programmers to interact with when writing PZ programs.
+
+     Users instantiate a Dataset by specifying a `source` that either points to a DataReader
+     or an existing Dataset. Users can then perform computations on the Dataset in a lazy fashion
+     by leveraging functions such as `filter`, `sem_filter`, `sem_add_columns`, `aggregate`, etc.
+     Underneath the hood, each of these operations creates a new Dataset. As a result, the Dataset
+     defines a lineage of computation.
      """
 
-     def __init__(self, source: str | list | pd.DataFrame | DataSource, schema: Schema | None = None, *args, **kwargs):
-         # convert source (str) -> source (DataSource) if need be
-         updated_source = DataDirectory().get_or_register_dataset(source) if isinstance(source, (str, list, pd.DataFrame)) else source
-         if schema is None:
-             schema = Schema.from_df(source) if isinstance(source, pd.DataFrame) else DefaultSchema
+     def __init__(
+         self,
+         source: str | Path | list | pd.DataFrame | DataReader | Dataset,
+         schema: Schema | None = None,
+         *args,
+         **kwargs,
+     ) -> None:
+         # NOTE: this function currently assumes that DataReader will always be provided with a schema;
+         # we will relax this assumption in a subsequent PR
+         # convert source into a DataReader
+         updated_source = get_local_datareader(source, **kwargs) if isinstance(source, (str, Path, list, pd.DataFrame)) else source
+
+         # get the schema
+         schema = updated_source.schema if schema is None else schema
+
          # intialize class
          super().__init__(updated_source, schema, *args, **kwargs)
 
-     def copy(self) -> Dataset:
-         source_copy = self._source.copy()
-         dataset_copy = Dataset(
-             schema=self.schema,
-             source=source_copy,
-             desc=self._desc,
-             filter=self._filter,
-             udf=self._udf,
-             agg_func=self._agg_func,
-             group_by=self._group_by,
-             index=self._index,
-             search_attr=self._search_attr,
-             output_attr=self._output_attr,
-             k=self._k,
-             limit=self._limit,
-             cardinality=self._cardinality,
-             depends_on=self._depends_on,
-             nocache=self._nocache,
-         )
-         return dataset_copy
-
      def filter(
          self,
-         _filter: str | Callable,
+         _filter: Callable,
          depends_on: str | list[str] | None = None,
      ) -> Dataset:
-         """Add a filter to the Set. This filter will possibly restrict the items that are returned later."""
+         """Add a user defined function as a filter to the Set. This filter will possibly restrict the items that are returned later."""
          f = None
-         if isinstance(_filter, str):
-             f = Filter(_filter)
-         elif callable(_filter):
+         if callable(_filter):
              f = Filter(filter_fn=_filter)
          else:
-             raise Exception("Filter type not supported.", type(_filter))
+             error_str = f"Only support callable for filter, currently got {type(_filter)}"
+             if isinstance(_filter, str):
+                 error_str += ". Consider using sem_filter() for semantic filters."
+             raise Exception(error_str)
 
          if isinstance(depends_on, str):
              depends_on = [depends_on]
@@ -190,33 +161,115 @@ class Dataset(Set):
              depends_on=depends_on,
              nocache=self._nocache,
          )
-
-     def convert(
+
+     def sem_filter(
          self,
-         output_schema: Schema,
-         udf: Callable | None = None,
-         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+         _filter: str,
          depends_on: str | list[str] | None = None,
-         desc: str = "Convert to new schema",
      ) -> Dataset:
-         """Convert the Set to a new schema."""
+         """Add a natural language description of a filter to the Set. This filter will possibly restrict the items that are returned later."""
+         f = None
+         if isinstance(_filter, str):
+             f = Filter(_filter)
+         else:
+             raise Exception("sem_filter() only supports `str` input for _filter.", type(_filter))
+
          if isinstance(depends_on, str):
              depends_on = [depends_on]
 
          return Dataset(
              source=self,
-             schema=output_schema,
-             udf=udf,
+             schema=self.schema,
+             filter=f,
+             depends_on=depends_on,
+             nocache=self._nocache,
+         )
+
+     def sem_add_columns(self, cols: list[dict] | type[Schema],
+                         cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                         depends_on: str | list[str] | None = None,
+                         desc: str = "Add new columns via semantic reasoning") -> Dataset:
+         """
+         Add new columns by specifying the column names, descriptions, and types.
+         The column will be computed during the execution of the Dataset.
+         Example:
+             sem_add_columns(
+                 [{'name': 'greeting', 'desc': 'The greeting message', 'type': str},
+                  {'name': 'age', 'desc': 'The age of the person', 'type': int},
+                  {'name': 'full_name', 'desc': 'The name of the person', 'type': str}]
+             )
+         """
+         if isinstance(depends_on, str):
+             depends_on = [depends_on]
+
+         new_output_schema = None
+         if isinstance(cols, list):
+             new_output_schema = self.schema.add_fields(cols)
+         elif issubclass(cols, Schema):
+             new_output_schema = self.schema.union(cols)
+         else:
+             raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+         return Dataset(
+             source=self,
+             schema=new_output_schema,
+             udf=None,
              cardinality=cardinality,
              depends_on=depends_on,
              desc=desc,
              nocache=self._nocache,
          )
-
-     # This is a convenience for users who like DataFrames-like syntax.
-     def add_columns(self, columns:dict[str, str], cardinality: Cardinality = Cardinality.ONE_TO_ONE) -> Dataset:
-         new_output_schema = self.schema.add_fields(columns)
-         return self.convert(new_output_schema, udf=None, cardinality=cardinality, depends_on=None, desc="Add columns " + str(columns))
+
+     def add_columns(self, udf: Callable,
+                     cols: list[dict] | type[Schema],
+                     cardinality: Cardinality = Cardinality.ONE_TO_ONE,
+                     depends_on: str | list[str] | None = None,
+                     desc: str = "Add new columns via UDF") -> Dataset:
+         """
+         Add new columns by specifying UDFs.
+
+         Examples:
+             add_columns(
+                 udf=compute_personal_greeting,
+                 cols=[
+                     {'name': 'greeting', 'desc': 'The greeting message', 'type': str},
+                     {'name': 'age', 'desc': 'The age of the person', 'type': int},
+                     {'name': 'full_name', 'desc': 'The name of the person', 'type': str},
+                 ]
+             )
+         """
+         if udf is None or cols is None:
+             raise ValueError("`udf` and `cols` must be provided for add_columns.")
+
+         if isinstance(depends_on, str):
+             depends_on = [depends_on]
+
+         new_output_schema = None
+         if isinstance(cols, list):
+             updated_cols = []
+             for col_dict in cols:
+                 assert isinstance(col_dict, dict), "each entry in `cols` must be a dictionary"
+                 assert "name" in col_dict, "each type must contain a 'name' key specifying the column name"
+                 assert "type" in col_dict, "each type must contain a 'type' key specifying the column type"
+                 col_dict["desc"] = col_dict.get("desc", "New column: " + col_dict["name"])
+                 updated_cols.append(col_dict)
+             new_output_schema = self.schema.add_fields(updated_cols)
+
+         elif issubclass(cols, Schema):
+             new_output_schema = self.schema.union(cols)
+
+         else:
+             raise ValueError("`cols` must be a list of dictionaries or a Schema.")
+
+         return Dataset(
+             source=self,
+             schema=new_output_schema,
+             udf=udf,
+             cardinality=cardinality,
+             desc=desc,
+             depends_on=depends_on,
+             nocache=self._nocache,
+         )
 
      def count(self) -> Dataset:
          """Apply a count aggregation to this set"""
@@ -247,12 +300,27 @@ class Dataset(Set):
              nocache=self._nocache,
          )
 
-     def retrieve(self, output_schema, index, search_attr, output_attr, k=-1) -> Dataset:
+     def retrieve(
+         self, index, search_func: Callable, search_attr: str, output_attr: str, output_attr_desc: str, k=-1
+     ) -> Dataset:
+         """
+         Retrieve the top k nearest neighbors of the value of the `search_attr` from the index and
+         stores it in the `output_attr` field. The output schema is a union of the current schema
+         and the `output_attr` with type ListField(StringField). `search_func` is a function of
+         type (index, query: str | list(str), k: int) -> list[str]. It should implement the lookup
+         logic for the index and return the top k results. The value of the `search_attr` field is
+         used as the query to lookup in the index. The results are stored in the `output_attr`
+         field. `output_attr_desc` is the description of the `output_attr` field.
+         """
+         # Output schema is a union of the current schema and the output_attr
+         attributes = {output_attr: ListField(StringField)(desc=output_attr_desc)}
+         output_schema = self.schema().union(type("Schema", (Schema,), attributes))
          return Dataset(
              source=self,
              schema=output_schema,
              desc="Retrieve",
              index=index,
+             search_func=search_func,
              search_attr=search_attr,
              output_attr=output_attr,
              k=k,
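Note that the new `retrieve()` contract above leaves the index lookup entirely to the caller: `search_func` must accept `(index, query, k)` and return a list of strings. A minimal sketch of such a function, assuming a hypothetical index object with a `search(query, top_k)` method (that API is not part of this diff):

def my_search_func(index, query: str | list[str], k: int) -> list[str]:
    # normalize the query to a list of strings
    queries = [query] if isinstance(query, str) else query
    results = []
    for q in queries:
        # hypothetical index API; assumed to return objects with a .text attribute
        results.extend(hit.text for hit in index.search(q, top_k=k))
    return results[:k]

# hypothetical usage: the column names and description below are illustrative only
# ds = ds.retrieve(index=my_index, search_func=my_search_func, search_attr="claim",
#                  output_attr="evidence", output_attr_desc="Passages supporting the claim", k=5)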
@@ -278,6 +346,14 @@ class Dataset(Set):
              nocache=self._nocache,
          )
 
-     def run(self, config: QueryProcessorConfig | None = None, **kwargs): # noqa: F821
+     def run(self, config: QueryProcessorConfig | None = None, **kwargs):
+         """Invoke the QueryProcessor to execute the query. `kwargs` will be applied to the QueryProcessorConfig."""
+         # TODO: this import currently needs to be here to avoid a circular import; we should fix this in a subsequent PR
          from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
+
+         # as syntactic sugar, we will allow some keyword arguments to parameterize our policies
+         policy = construct_policy_from_kwargs(**kwargs)
+         if policy is not None:
+             kwargs["policy"] = policy
+
          return QueryProcessorFactory.create_and_run_processor(self, config, **kwargs)
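Taken together, the sets.py changes split the old overloaded `filter()`/`convert()` API into `filter()`/`sem_filter()` and `add_columns()`/`sem_add_columns()`, and fold policy construction into `run()`. A minimal sketch of a 0.6.0-style program; the data path, column definitions, filter predicates, and record attribute access are illustrative assumptions rather than taken from this diff:

from palimpzest.sets import Dataset

# a string/Path source is converted into a DataReader via get_local_datareader()
ds = Dataset("testdata/emails")  # hypothetical directory of text files

# semantic column extraction: each dict supplies a name, description, and Python type
ds = ds.sem_add_columns([
    {"name": "sender", "desc": "The email address of the sender", "type": str},
    {"name": "subject", "desc": "The subject line of the email", "type": str},
])

# natural-language filter vs. plain-callable filter
ds = ds.sem_filter("The email discusses a quarterly earnings report")
ds = ds.filter(lambda record: record.subject is not None)  # assumes attribute-style record access

# run() builds a QueryProcessor; policy-related kwargs are folded in via construct_policy_from_kwargs()
output = ds.run()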
palimpzest/tools/pdfparser.py CHANGED
@@ -3,7 +3,7 @@ import io
  import json
  import os
  import time
- from typing import BinaryIO, List
+ from typing import BinaryIO
  from zipfile import ZipFile
 
  import pandas as pd
@@ -11,32 +11,9 @@ import requests
  from fastapi import status
  from pypdf import PdfReader
 
- from palimpzest.config import Config
-
  COSMOS_ADDRESS = "https://xdd.wisc.edu/cosmos_service"
 
 
- class PdfParser:
-     def __init__(self, pdf_path: str):
-         self.pdf_path = pdf_path
-         with open(pdf_path, "rb") as f:
-             self.pdf = f.read()
-         self.text = ""
-         self.pages = []
-         self._parse()
-
-     def _parse(self):
-         for page in self.pdf:
-             self.text += page.get_text() # type: ignore
-             self.pages.append(page.get_text()) # type: ignore
-
-     def get_text(self) -> str:
-         return self.text
-
-     def get_pages(self) -> List[str]:
-         return self.pages
-
-
  def get_md5(file_bytes: bytes) -> str:
      if not isinstance(file_bytes, bytes):
          file_bytes = file_bytes.encode()
@@ -209,15 +186,9 @@ def cosmos_client(name: str, data: BinaryIO, output_dir: str, delay=10):
  # 1. Check if the text file already exists in the cache, if so, read from the cache
  # 2. If not, call the cosmos_client function to process the PDF file and cache the text file
  ##
- # NOTE: I don't believe anyone actively depends on this function, but we need to remove the
- # dependency on DataDirectory() in order to prevent circular imports. The long-term solution
- # is to separate out the pieces of DataDirectory which the DataSources depend on, from the
- # pieces which are related to setting / reading external configurations (like "pdfprocessor").
- # However, given that I can fix this in two minutes by adding this is a kwarg, I'm going to
- # do that for now and revisit the issue if/when this matters.
 
  # TODO(Jun): 1. cosmos returns 202 for me. 2. why only accept "pypdf" and "cosmos" as pdfprocessor?
- def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_cache=True, file_cache_dir="/tmp"):
+ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="pypdf", enable_file_cache=True, file_cache_dir="/tmp"):
      pdf_filename = filename
      file_name = os.path.basename(pdf_filename)
      file_name_without_extension = os.path.splitext(file_name)[0]
@@ -229,11 +200,12 @@ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_ca
          for page in pdf.pages:
              all_text += page.extract_text() + "\n"
          return all_text
-         # return pdf.pages[0].extract_text() # TODO we can only return first page
+
      else:
          # Get md5 of the pdf_bytes
          md5 = get_md5(pdf_bytes)
          cached_extraction_folder = f"COSMOS_{os.path.splitext(file_name)[0].replace(' ', '_')}_{md5}"
+
          # Check if pz_file_cache_dir exists in the file system
          pz_file_cache_dir = os.path.join(file_cache_dir, cached_extraction_folder)
          if enable_file_cache and os.path.exists(pz_file_cache_dir):
@@ -243,43 +215,12 @@ def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_ca
                  text_content = file.read()
              return text_content
 
-         #
-         # CHUNWEI: This code has a bug
-         # It checks to see if the text file name is in the registry, but there are two things wrong here.
-         # 1) The registry is for 'official' datasets that have been inserted by the user, not cached objects.
-         # 2) The filename isn't enough to check for cached results. Maybe the file moved directories, or maybe there are
-         # multiple different files with the same name. You need the checksum of the original file to ensure the cached
-         # object is valid.
-         #
-         # if DataDirectory().exists(text_file_name):
-         # print(f"Text file {text_file_name} already exists, reading from cache")
-         # text_file_path = DataDirectory().get_path(text_file_name)
-         # with open(text_file_path, 'r') as file:
-         # text_content = file.read()
-         # return text_content
-         # cosmos_file_dir = file_name_without_extension.replace(" ", "_")
-         # get a tmp of the system temp directory
-
-         print(f"Processing {file_name} through COSMOS")
          # Call the cosmos_client function
+         print(f"Processing {file_name} through COSMOS")
          cosmos_client(file_name, pdf_bytes, file_cache_dir)
          text_file_path = os.path.join(pz_file_cache_dir, text_file_name)
          if not os.path.exists(text_file_path):
              raise FileNotFoundError(f"Text file {text_file_name} not found in {pz_file_cache_dir}/{text_file_name}")
-         # DataDirectory().register_local_file(text_file_path, text_file_name)
          with open(text_file_path) as file:
              text_content = file.read()
          return text_content
-
-
- if __name__ == "__main__":
-     config = Config("default")
-     file_path = "../../../testdata/pdfs-tiny/battery.pdf"
-     # output_dir = "../../../tests/testFileDirectory/cosmos"
-     with open(file_path, "rb") as file:
-         text = get_text_from_pdf(file_path, file.read())
-         print(text)
-     # file_name = os.path.basename(file_path)
-     # # Call the cosmos_client function
-     # cosmos_client(file_name, file, output_dir)
-     # DataDirectory().rm_registered_dataset("sidarthe.annotations.txt")
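Besides deleting the unused PdfParser class and the __main__ scratch code, the key behavioral change here is that get_text_from_pdf() now defaults to pdfprocessor="pypdf", so PDF text extraction no longer calls the COSMOS web service unless explicitly requested. A usage sketch; the file path reuses the example path from the removed __main__ block:

from palimpzest.tools.pdfparser import get_text_from_pdf

pdf_path = "testdata/pdfs-tiny/battery.pdf"
with open(pdf_path, "rb") as f:
    # with the new default this extracts text locally via pypdf;
    # pass pdfprocessor="cosmos" to restore the old web-service behavior
    text = get_text_from_pdf(pdf_path, f.read())
print(text[:500])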
palimpzest/utils/datareader_helpers.py ADDED
@@ -0,0 +1,61 @@
+ import os
+ from pathlib import Path
+
+ import pandas as pd
+
+ from palimpzest import constants
+ from palimpzest.core.data.datareaders import (
+     DataReader,
+     FileReader,
+     HTMLFileDirectoryReader,
+     ImageFileDirectoryReader,
+     MemoryReader,
+     PDFFileDirectoryReader,
+     TextFileDirectoryReader,
+     XLSFileDirectoryReader,
+ )
+
+
+ def get_local_source(path: str | Path, **kwargs) -> DataReader:
+     """Return a DataReader for a local file or directory."""
+     if os.path.isfile(path):
+         return FileReader(path)
+
+     elif os.path.isdir(path):
+         if all([f.endswith(tuple(constants.IMAGE_EXTENSIONS)) for f in os.listdir(path)]):
+             return ImageFileDirectoryReader(path)
+
+         elif all([f.endswith(tuple(constants.PDF_EXTENSIONS)) for f in os.listdir(path)]):
+             pdfprocessor = kwargs.get("pdfprocessor", constants.DEFAULT_PDF_PROCESSOR)
+             file_cache_dir = kwargs.get("file_cache_dir", "/tmp")
+             return PDFFileDirectoryReader(
+                 path=path, pdfprocessor=pdfprocessor, file_cache_dir=file_cache_dir
+             )
+
+         elif all([f.endswith(tuple(constants.XLS_EXTENSIONS)) for f in os.listdir(path)]):
+             return XLSFileDirectoryReader(path)
+
+         elif all([f.endswith(tuple(constants.HTML_EXTENSIONS)) for f in os.listdir(path)]):
+             return HTMLFileDirectoryReader(path)
+
+         else:
+             return TextFileDirectoryReader(path)
+     else:
+         raise Exception(f"Path {path} is invalid. Does not point to a file or directory.")
+
+
+ def get_local_datareader(source: str | Path | list | pd.DataFrame, **kwargs) -> DataReader:
+     """
+     This helper function returns a `DataReader` object based on the `source` type.
+     The returned `DataReader` object is guaranteed to have a schema.
+     """
+     if isinstance(source, (str, Path)):
+         source = get_local_source(source, **kwargs)
+
+     elif isinstance(source, (list, pd.DataFrame)):
+         source = MemoryReader(source)
+
+     else:
+         raise Exception(f"Invalid source type: {type(source)}, We only support str, Path, list[dict], and pd.DataFrame")
+
+     return source
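This helper is what Dataset.__init__ now calls for raw sources: strings and Paths go through get_local_source(), which picks a directory reader by file extension, while lists and DataFrames become a MemoryReader. A short sketch of the two paths; the directory path is hypothetical:

import pandas as pd
from palimpzest.utils.datareader_helpers import get_local_datareader

# a directory of .txt files falls through to TextFileDirectoryReader
dir_reader = get_local_datareader("testdata/enron-tiny")

# an in-memory DataFrame becomes a MemoryReader
df = pd.DataFrame({"name": ["Alice", "Bob"], "age": [34, 28]})
df_reader = get_local_datareader(df)

# both satisfy the DataReader interface that Dataset expects as a source
print(type(dir_reader).__name__, type(df_reader).__name__)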
palimpzest/utils/field_helpers.py ADDED
@@ -0,0 +1,69 @@
+ import types
+
+ from palimpzest.core.lib.fields import (
+     BooleanField,
+     BytesField,
+     Field,
+     FloatField,
+     IntField,
+     ListField,
+     NumericField,
+     StringField,
+ )
+
+
+ def assert_valid_field_type(field_type: type | types.UnionType | types.GenericAlias | Field) -> str:
+     """
+     Assert that the field is a valid field type. Return "pz_type" if field_type is a PZ type
+     and "python_type" if it is a Python type.
+     """
+     try:
+         assert issubclass(field_type, Field), "type must be a Python type or palimpzest.core.lib.fields.Field"
+         return "pz_type"
+     except Exception:
+         assert isinstance(field_type, (type, types.UnionType, types.GenericAlias)), "type must be a Python type or palimpzest.core.lib.fields.Field"
+
+     return "python_type"
+
+
+ def construct_field_type(field_type: type | types.UnionType | types.GenericAlias | Field, desc: str) -> Field:
+     """Convert a field type and description to the corresponding PZ field.
+
+     Args:
+         type: type for the field (e.g. str, bool, list[int], StringField, etc.)
+         desc: description used in the field constructor
+
+     Returns:
+         Corresponding Field class
+
+     Raises:
+         ValueError: If the type is not recognized
+     """
+     # if field_type is a PZ type, construct and return the field
+     if assert_valid_field_type(field_type) == "pz_type":
+         return field_type(desc=desc)
+
+     # otherwise, map the Python type to a PZ type and construct the field
+     supported_types_map = {
+         str: StringField,
+         bool: BooleanField,
+         int: IntField,
+         float: FloatField,
+         int | float: NumericField,
+         bytes: BytesField,
+         list[str]: ListField(StringField),
+         list[bool]: ListField(BooleanField),
+         list[int]: ListField(IntField),
+         list[float]: ListField(FloatField),
+         list[int | float]: ListField(NumericField),
+         list[bytes]: ListField(BytesField),
+     }
+
+     if field_type not in supported_types_map:
+         raise ValueError(f"Unsupported type: {field_type}. Supported types are: {list(supported_types_map.keys())}")
+
+     # get the field class and (if applicable) element field class
+     field_cls = supported_types_map[field_type]
+
+     # construct and return the field
+     return field_cls(desc=desc)
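construct_field_type() is the bridge that lets sem_add_columns() / add_columns() accept plain Python types in their `cols` dictionaries. A short sketch of how it resolves types, based on the mapping above:

from palimpzest.core.lib.fields import StringField
from palimpzest.utils.field_helpers import construct_field_type

# Python types are mapped through supported_types_map ...
age_field = construct_field_type(int, desc="The age of the person")
names_field = construct_field_type(list[str], desc="All names mentioned in the text")

# ... while a PZ Field subclass is instantiated directly
title_field = construct_field_type(StringField, desc="The title of the paper")

# anything outside the mapping raises ValueError
try:
    construct_field_type(dict, desc="not supported")
except ValueError as err:
    print(err)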
palimpzest/utils/hash_helpers.py CHANGED
@@ -1,4 +1,5 @@
  import hashlib
+ import json
 
  from palimpzest.constants import MAX_ID_CHARS
 
@@ -7,5 +8,5 @@ def hash_for_id(id_str: str, max_chars: int = MAX_ID_CHARS) -> str:
      return hashlib.sha256(id_str.encode("utf-8")).hexdigest()[:max_chars]
 
 
- def hash_for_temp_schema(id_str:str) ->str:
-     return hash_for_id(id_str)
+ def hash_for_serialized_dict(dict_obj: dict) -> str:
+     return hash_for_id(json.dumps(dict_obj, sort_keys=True))
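With Set.universal_identifier() now delegating to this helper, a node's identifier is simply a truncated SHA-256 of its key-sorted JSON serialization. A small illustration of the property that matters here (key order does not affect the hash):

from palimpzest.utils.hash_helpers import hash_for_serialized_dict

a = {"schema": "Email", "limit": 10}
b = {"limit": 10, "schema": "Email"}

# json.dumps(..., sort_keys=True) makes the serialization, and therefore the hash, order-insensitive
assert hash_for_serialized_dict(a) == hash_for_serialized_dict(b)
print(hash_for_serialized_dict(a))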