PyPI - palimpzest - Versions diffs - 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

palimpzest: palimpzest-0.5.3-py3-none-any.whl → palimpzest-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

palimpzest/__init__.py +7 -9
palimpzest/constants.py +47 -7
palimpzest/core/__init__.py +20 -26
palimpzest/core/data/dataclasses.py +9 -2
palimpzest/core/data/datareaders.py +497 -0
palimpzest/core/elements/records.py +29 -37
palimpzest/core/lib/fields.py +14 -12
palimpzest/core/lib/schemas.py +80 -94
palimpzest/policy.py +58 -0
palimpzest/prompts/__init__.py +22 -0
palimpzest/prompts/code_synthesis_prompts.py +28 -0
palimpzest/prompts/convert_prompts.py +87 -0
palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
palimpzest/prompts/filter_prompts.py +69 -0
palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
palimpzest/prompts/prompt_factory.py +732 -0
palimpzest/prompts/util_phrases.py +14 -0
palimpzest/query/execution/execution_strategy.py +0 -3
palimpzest/query/execution/parallel_execution_strategy.py +12 -25
palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
palimpzest/query/generators/generators.py +71 -347
palimpzest/query/operators/__init__.py +5 -5
palimpzest/query/operators/aggregate.py +10 -5
palimpzest/query/operators/code_synthesis_convert.py +4 -48
palimpzest/query/operators/convert.py +5 -2
palimpzest/query/operators/critique_and_refine_convert.py +112 -0
palimpzest/query/operators/filter.py +1 -1
palimpzest/query/operators/limit.py +1 -1
palimpzest/query/operators/logical.py +28 -27
palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
palimpzest/query/operators/physical.py +32 -20
palimpzest/query/operators/project.py +1 -1
palimpzest/query/operators/rag_convert.py +6 -3
palimpzest/query/operators/retrieve.py +13 -31
palimpzest/query/operators/scan.py +150 -0
palimpzest/query/optimizer/__init__.py +5 -1
palimpzest/query/optimizer/cost_model.py +18 -34
palimpzest/query/optimizer/optimizer.py +40 -25
palimpzest/query/optimizer/optimizer_strategy.py +26 -0
palimpzest/query/optimizer/plan.py +2 -2
palimpzest/query/optimizer/rules.py +118 -27
palimpzest/query/processor/config.py +12 -1
palimpzest/query/processor/mab_sentinel_processor.py +125 -112
palimpzest/query/processor/nosentinel_processor.py +46 -62
palimpzest/query/processor/query_processor.py +10 -20
palimpzest/query/processor/query_processor_factory.py +12 -5
palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
palimpzest/query/processor/streaming_processor.py +11 -17
palimpzest/sets.py +170 -94
palimpzest/tools/pdfparser.py +5 -64
palimpzest/utils/datareader_helpers.py +61 -0
palimpzest/utils/field_helpers.py +69 -0
palimpzest/utils/hash_helpers.py +3 -2
palimpzest/utils/udfs.py +0 -28
{palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
palimpzest-0.6.0.dist-info/RECORD +87 -0
{palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
cli/README.md +0 -156
cli/__init__.py +0 -0
cli/cli_main.py +0 -390
palimpzest/config.py +0 -89
palimpzest/core/data/datasources.py +0 -369
palimpzest/datamanager/__init__.py +0 -0
palimpzest/datamanager/datamanager.py +0 -300
palimpzest/prompts.py +0 -397
palimpzest/query/operators/datasource.py +0 -202
palimpzest-0.5.3.dist-info/RECORD +0 -83
palimpzest-0.5.3.dist-info/entry_points.txt +0 -2
{palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
{palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0

palimpzest/__init__.py CHANGED Viewed

@@ -1,7 +1,5 @@
-from palimpzest.constants import MAX_ROWS, Cardinality
-# data management
-from palimpzest.datamanager.datamanager import DataDirectory
+from palimpzest.constants import Cardinality
+from palimpzest.core.data.datareaders import DataReader
 from palimpzest.policy import (
     MaxQuality,
     MaxQualityAtFixedCost,
@@ -13,16 +11,14 @@ from palimpzest.policy import (
     PlanCost,
     Policy,
 )
-# dataset functionality
+from palimpzest.query.processor.config import QueryProcessorConfig
 from palimpzest.sets import Dataset
 __all__ = [
     # constants
-    "MAX_ROWS",
     "Cardinality",
-    # datamanager
-    "DataDirectory",
+    # core
+    "DataReader",
     # policy
     "MaxQuality",
     "MaxQualityAtFixedCost",
@@ -33,6 +29,8 @@ __all__ = [
     "MinTimeAtFixedQuality",
     "PlanCost",
     "Policy",
+    # query
+    "QueryProcessorConfig",
     # sets
     "Dataset",
 ]

palimpzest/constants.py CHANGED Viewed

@@ -27,14 +27,52 @@ class PromptStrategy(str, Enum):
     PromptStrategy describes the prompting technique to be used by a Generator when
     performing some task with a specified Model.
     """
+    # Chain-of-Thought Boolean Prompt Strategies
     COT_BOOL = "chain-of-thought-bool"
+    # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
+    # COT_BOOL_REFINE = "chain-of-thought-bool-refine"
+    # Chain-of-Thought Boolean with Image Prompt Strategies
     COT_BOOL_IMAGE = "chain-of-thought-bool-image"
+    # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
+    # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"
+    # Chain-of-Thought Question Answering Prompt Strategies
     COT_QA = "chain-of-thought-question"
+    COT_QA_CRITIC = "chain-of-thought-question-critic"
+    COT_QA_REFINE = "chain-of-thought-question-refine"
+    # Chain-of-Thought Question with Image Prompt Strategies
     COT_QA_IMAGE = "chain-of-thought-question-image"
+    COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
+    COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
+    # Mixture-of-Agents Prompt Strategies
     COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
     COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
     COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
+    def is_image_prompt(self):
+        return "image" in self.value
+    def is_cot_bool_prompt(self):
+        return "chain-of-thought-bool" in self.value
+    def is_cot_qa_prompt(self):
+        return "chain-of-thought-question" in self.value
+    def is_critic_prompt(self):
+        return "critic" in self.value
+    def is_refine_prompt(self):
+        return "refine" in self.value
+    def is_moa_proposer_prompt(self):
+        return "mixture-of-agents-proposer" in self.value
+    def is_moa_aggregator_prompt(self):
+        return "mixture-of-agents-aggregation" in self.value
 class AggFunc(str, Enum):
     COUNT = "count"
@@ -67,10 +105,11 @@ HTML_EXTENSIONS = [".html", ".htm"]
 # the number of seconds the parallel execution will sleep for while waiting for futures to complete
 PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS = 0.3
+# default PDF parser
+DEFAULT_PDF_PROCESSOR = "pypdf"
 # character limit for various IDs
 MAX_ID_CHARS = 10
-DEFAULT_DATASET_ID_CHARS = 16
-MAX_DATASET_ID_CHARS = 100
 # retry LLM executions 2^x * (multiplier) for up to 10 seconds and at most 4 times
 RETRY_MULTIPLIER = 2
@@ -98,9 +137,15 @@ LOCAL_SCAN_TIME_PER_KB = 1 / (float(500) * 1024)
 # Assume 30 GB/sec for sequential access of memory
 MEMORY_SCAN_TIME_PER_KB = 1 / (float(30) * 1024 * 1024)
+# Assume 1 KB per record
+NAIVE_BYTES_PER_RECORD = 1024
 # Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
 TOKENS_PER_CHARACTER = 0.25
+# Rough estimate of the number of tokens the context is allowed to take up for MIXTRAL and LLAMA3 models
+MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT = 6000
 # a naive estimate for the input record size
 NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
@@ -128,11 +173,6 @@ NAIVE_PDF_PROCESSOR_TIME_PER_RECORD = 10.0
 # Whether or not to log LLM outputs
 LOG_LLM_OUTPUT = False
-# Derived schema prefix
-DERIVED_SCHEMA_PREFIX = "DerivedSchema_"
-# Derived source_id for records created from a DataFrame
-FROM_DF_PREFIX = "FROM_DF_"
 #### MODEL PERFORMANCE & COST METRICS ####
 # I've looked across models and grouped knowledge into commonly used categories:

palimpzest/core/__init__.py CHANGED Viewed

@@ -1,15 +1,13 @@
-from palimpzest.core.data.datasources import (
-    DataSource,
-    DirectorySource,
-    FileSource,
-    HTMLFileDirectorySource,
-    ImageFileDirectorySource,
-    MemorySource,
-    PDFFileDirectorySource,
-    TextFileDirectorySource,
-    UserSource,
-    ValidationDataSource,
-    XLSFileDirectorySource,
+from palimpzest.core.data.datareaders import (
+    DataReader,
+    DirectoryReader,
+    FileReader,
+    HTMLFileDirectoryReader,
+    ImageFileDirectoryReader,
+    MemoryReader,
+    PDFFileDirectoryReader,
+    TextFileDirectoryReader,
+    XLSFileDirectoryReader,
 )
 from palimpzest.core.elements.records import DataRecord, DataRecordSet
 from palimpzest.core.lib.fields import (
@@ -33,7 +31,6 @@ from palimpzest.core.lib.schemas import (
     PlotImage,
     RawJSONObject,
     Schema,
-    SourceRecord,
     Table,
     TextFile,
     WebPage,
@@ -61,23 +58,20 @@ __all__ = [
     "PlotImage",
     "RawJSONObject",
     "Schema",
-    "SourceRecord",
     "Table",
     "TextFile",
     "WebPage",
     "XLSFile",
-    # datasources
-    "DataSource",
-    "DirectorySource",
-    "FileSource",
-    "HTMLFileDirectorySource",
-    "ImageFileDirectorySource",
-    "MemorySource",
-    "PDFFileDirectorySource",
-    "TextFileDirectorySource",
-    "UserSource",
-    "ValidationDataSource",
-    "XLSFileDirectorySource",
+    # datareaders
+    "DataReader",
+    "DirectoryReader",
+    "FileReader",
+    "HTMLFileDirectoryReader",
+    "ImageFileDirectoryReader",
+    "MemoryReader",
+    "PDFFileDirectoryReader",
+    "TextFileDirectoryReader",
+    "XLSFileDirectoryReader",
     # records
     "DataRecord",
     "DataRecordSet",

palimpzest/core/data/dataclasses.py CHANGED Viewed

@@ -124,8 +124,8 @@ class RecordOpStats:
     # identifier for the parent of this record
     record_parent_id: str
-    # idenifier for the source of this record
-    record_source_id: str
+    # idenifier for the source idx of this record
+    record_source_idx: str
     # a dictionary with the record state after being processed by the operator
     record_state: dict[str, Any]
@@ -406,6 +406,13 @@ class OperatorCostEstimates:
     # upper bound on quality
     quality_upper_bound: float | None = None
+    def __rmul__(self, multiplier: float) -> OperatorCostEstimates:
+        """
+        Multiply all fields by a scalar.
+        """
+        dct = {field.name: getattr(self, field.name) * multiplier for field in fields(self)}
+        return OperatorCostEstimates(**dct)
     def __post_init__(self):
         if self.cardinality_lower_bound is None and self.cardinality_upper_bound is None:
             self.cardinality_lower_bound = self.cardinality

palimpzest 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

palimpzest: palimpzest-0.5.3-py3-none-any.whl → palimpzest-0.6.0-py3-none-any.whl