palimpzest 0.5.4__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +7 -9
- palimpzest/constants.py +47 -7
- palimpzest/core/__init__.py +20 -26
- palimpzest/core/data/dataclasses.py +9 -2
- palimpzest/core/data/datareaders.py +497 -0
- palimpzest/core/elements/records.py +29 -37
- palimpzest/core/lib/fields.py +14 -12
- palimpzest/core/lib/schemas.py +80 -94
- palimpzest/policy.py +58 -0
- palimpzest/prompts/__init__.py +22 -0
- palimpzest/prompts/code_synthesis_prompts.py +28 -0
- palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
- palimpzest/prompts/filter_prompts.py +69 -0
- palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
- palimpzest/prompts/prompt_factory.py +732 -0
- palimpzest/prompts/util_phrases.py +14 -0
- palimpzest/query/execution/execution_strategy.py +0 -3
- palimpzest/query/execution/parallel_execution_strategy.py +12 -25
- palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
- palimpzest/query/generators/generators.py +71 -347
- palimpzest/query/operators/__init__.py +5 -5
- palimpzest/query/operators/aggregate.py +10 -5
- palimpzest/query/operators/code_synthesis_convert.py +4 -48
- palimpzest/query/operators/convert.py +5 -2
- palimpzest/query/operators/critique_and_refine_convert.py +112 -0
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/logical.py +28 -27
- palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
- palimpzest/query/operators/physical.py +32 -20
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/rag_convert.py +6 -3
- palimpzest/query/operators/retrieve.py +13 -31
- palimpzest/query/operators/scan.py +150 -0
- palimpzest/query/optimizer/__init__.py +5 -1
- palimpzest/query/optimizer/cost_model.py +18 -34
- palimpzest/query/optimizer/optimizer.py +40 -25
- palimpzest/query/optimizer/optimizer_strategy.py +26 -0
- palimpzest/query/optimizer/plan.py +2 -2
- palimpzest/query/optimizer/rules.py +118 -27
- palimpzest/query/processor/config.py +12 -1
- palimpzest/query/processor/mab_sentinel_processor.py +125 -112
- palimpzest/query/processor/nosentinel_processor.py +46 -62
- palimpzest/query/processor/query_processor.py +10 -20
- palimpzest/query/processor/query_processor_factory.py +12 -5
- palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
- palimpzest/query/processor/streaming_processor.py +11 -17
- palimpzest/sets.py +170 -94
- palimpzest/tools/pdfparser.py +5 -64
- palimpzest/utils/datareader_helpers.py +61 -0
- palimpzest/utils/field_helpers.py +69 -0
- palimpzest/utils/hash_helpers.py +3 -2
- palimpzest/utils/udfs.py +0 -28
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
- palimpzest-0.6.0.dist-info/RECORD +87 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
- cli/README.md +0 -156
- cli/__init__.py +0 -0
- cli/cli_main.py +0 -390
- palimpzest/config.py +0 -89
- palimpzest/core/data/datasources.py +0 -369
- palimpzest/datamanager/__init__.py +0 -0
- palimpzest/datamanager/datamanager.py +0 -300
- palimpzest/prompts.py +0 -397
- palimpzest/query/operators/datasource.py +0 -202
- palimpzest-0.5.4.dist-info/RECORD +0 -83
- palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
palimpzest/__init__.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
from palimpzest.constants import
|
|
2
|
-
|
|
3
|
-
# data management
|
|
4
|
-
from palimpzest.datamanager.datamanager import DataDirectory
|
|
1
|
+
from palimpzest.constants import Cardinality
|
|
2
|
+
from palimpzest.core.data.datareaders import DataReader
|
|
5
3
|
from palimpzest.policy import (
|
|
6
4
|
MaxQuality,
|
|
7
5
|
MaxQualityAtFixedCost,
|
|
@@ -13,16 +11,14 @@ from palimpzest.policy import (
|
|
|
13
11
|
PlanCost,
|
|
14
12
|
Policy,
|
|
15
13
|
)
|
|
16
|
-
|
|
17
|
-
# dataset functionality
|
|
14
|
+
from palimpzest.query.processor.config import QueryProcessorConfig
|
|
18
15
|
from palimpzest.sets import Dataset
|
|
19
16
|
|
|
20
17
|
__all__ = [
|
|
21
18
|
# constants
|
|
22
|
-
"MAX_ROWS",
|
|
23
19
|
"Cardinality",
|
|
24
|
-
#
|
|
25
|
-
"
|
|
20
|
+
# core
|
|
21
|
+
"DataReader",
|
|
26
22
|
# policy
|
|
27
23
|
"MaxQuality",
|
|
28
24
|
"MaxQualityAtFixedCost",
|
|
@@ -33,6 +29,8 @@ __all__ = [
|
|
|
33
29
|
"MinTimeAtFixedQuality",
|
|
34
30
|
"PlanCost",
|
|
35
31
|
"Policy",
|
|
32
|
+
# query
|
|
33
|
+
"QueryProcessorConfig",
|
|
36
34
|
# sets
|
|
37
35
|
"Dataset",
|
|
38
36
|
]
|
palimpzest/constants.py
CHANGED
|
@@ -27,14 +27,52 @@ class PromptStrategy(str, Enum):
|
|
|
27
27
|
PromptStrategy describes the prompting technique to be used by a Generator when
|
|
28
28
|
performing some task with a specified Model.
|
|
29
29
|
"""
|
|
30
|
+
# Chain-of-Thought Boolean Prompt Strategies
|
|
30
31
|
COT_BOOL = "chain-of-thought-bool"
|
|
32
|
+
# COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
|
|
33
|
+
# COT_BOOL_REFINE = "chain-of-thought-bool-refine"
|
|
34
|
+
|
|
35
|
+
# Chain-of-Thought Boolean with Image Prompt Strategies
|
|
31
36
|
COT_BOOL_IMAGE = "chain-of-thought-bool-image"
|
|
37
|
+
# COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
|
|
38
|
+
# COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"
|
|
39
|
+
|
|
40
|
+
# Chain-of-Thought Question Answering Prompt Strategies
|
|
32
41
|
COT_QA = "chain-of-thought-question"
|
|
42
|
+
COT_QA_CRITIC = "chain-of-thought-question-critic"
|
|
43
|
+
COT_QA_REFINE = "chain-of-thought-question-refine"
|
|
44
|
+
|
|
45
|
+
# Chain-of-Thought Question with Image Prompt Strategies
|
|
33
46
|
COT_QA_IMAGE = "chain-of-thought-question-image"
|
|
47
|
+
COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
|
|
48
|
+
COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
|
|
49
|
+
|
|
50
|
+
# Mixture-of-Agents Prompt Strategies
|
|
34
51
|
COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
|
|
35
52
|
COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
|
|
36
53
|
COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
|
|
37
54
|
|
|
55
|
+
def is_image_prompt(self):
|
|
56
|
+
return "image" in self.value
|
|
57
|
+
|
|
58
|
+
def is_cot_bool_prompt(self):
|
|
59
|
+
return "chain-of-thought-bool" in self.value
|
|
60
|
+
|
|
61
|
+
def is_cot_qa_prompt(self):
|
|
62
|
+
return "chain-of-thought-question" in self.value
|
|
63
|
+
|
|
64
|
+
def is_critic_prompt(self):
|
|
65
|
+
return "critic" in self.value
|
|
66
|
+
|
|
67
|
+
def is_refine_prompt(self):
|
|
68
|
+
return "refine" in self.value
|
|
69
|
+
|
|
70
|
+
def is_moa_proposer_prompt(self):
|
|
71
|
+
return "mixture-of-agents-proposer" in self.value
|
|
72
|
+
|
|
73
|
+
def is_moa_aggregator_prompt(self):
|
|
74
|
+
return "mixture-of-agents-aggregation" in self.value
|
|
75
|
+
|
|
38
76
|
|
|
39
77
|
class AggFunc(str, Enum):
|
|
40
78
|
COUNT = "count"
|
|
@@ -67,10 +105,11 @@ HTML_EXTENSIONS = [".html", ".htm"]
|
|
|
67
105
|
# the number of seconds the parallel execution will sleep for while waiting for futures to complete
|
|
68
106
|
PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS = 0.3
|
|
69
107
|
|
|
108
|
+
# default PDF parser
|
|
109
|
+
DEFAULT_PDF_PROCESSOR = "pypdf"
|
|
110
|
+
|
|
70
111
|
# character limit for various IDs
|
|
71
112
|
MAX_ID_CHARS = 10
|
|
72
|
-
DEFAULT_DATASET_ID_CHARS = 16
|
|
73
|
-
MAX_DATASET_ID_CHARS = 100
|
|
74
113
|
|
|
75
114
|
# retry LLM executions 2^x * (multiplier) for up to 10 seconds and at most 4 times
|
|
76
115
|
RETRY_MULTIPLIER = 2
|
|
@@ -98,9 +137,15 @@ LOCAL_SCAN_TIME_PER_KB = 1 / (float(500) * 1024)
|
|
|
98
137
|
# Assume 30 GB/sec for sequential access of memory
|
|
99
138
|
MEMORY_SCAN_TIME_PER_KB = 1 / (float(30) * 1024 * 1024)
|
|
100
139
|
|
|
140
|
+
# Assume 1 KB per record
|
|
141
|
+
NAIVE_BYTES_PER_RECORD = 1024
|
|
142
|
+
|
|
101
143
|
# Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
|
|
102
144
|
TOKENS_PER_CHARACTER = 0.25
|
|
103
145
|
|
|
146
|
+
# Rough estimate of the number of tokens the context is allowed to take up for MIXTRAL and LLAMA3 models
|
|
147
|
+
MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT = 6000
|
|
148
|
+
|
|
104
149
|
# a naive estimate for the input record size
|
|
105
150
|
NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
|
|
106
151
|
|
|
@@ -128,11 +173,6 @@ NAIVE_PDF_PROCESSOR_TIME_PER_RECORD = 10.0
|
|
|
128
173
|
# Whether or not to log LLM outputs
|
|
129
174
|
LOG_LLM_OUTPUT = False
|
|
130
175
|
|
|
131
|
-
# Derived schema prefix
|
|
132
|
-
DERIVED_SCHEMA_PREFIX = "DerivedSchema_"
|
|
133
|
-
|
|
134
|
-
# Derived source_id for records created from a DataFrame
|
|
135
|
-
FROM_DF_PREFIX = "FROM_DF_"
|
|
136
176
|
|
|
137
177
|
#### MODEL PERFORMANCE & COST METRICS ####
|
|
138
178
|
# I've looked across models and grouped knowledge into commonly used categories:
|
palimpzest/core/__init__.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
from palimpzest.core.data.
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
ValidationDataSource,
|
|
12
|
-
XLSFileDirectorySource,
|
|
1
|
+
from palimpzest.core.data.datareaders import (
|
|
2
|
+
DataReader,
|
|
3
|
+
DirectoryReader,
|
|
4
|
+
FileReader,
|
|
5
|
+
HTMLFileDirectoryReader,
|
|
6
|
+
ImageFileDirectoryReader,
|
|
7
|
+
MemoryReader,
|
|
8
|
+
PDFFileDirectoryReader,
|
|
9
|
+
TextFileDirectoryReader,
|
|
10
|
+
XLSFileDirectoryReader,
|
|
13
11
|
)
|
|
14
12
|
from palimpzest.core.elements.records import DataRecord, DataRecordSet
|
|
15
13
|
from palimpzest.core.lib.fields import (
|
|
@@ -33,7 +31,6 @@ from palimpzest.core.lib.schemas import (
|
|
|
33
31
|
PlotImage,
|
|
34
32
|
RawJSONObject,
|
|
35
33
|
Schema,
|
|
36
|
-
SourceRecord,
|
|
37
34
|
Table,
|
|
38
35
|
TextFile,
|
|
39
36
|
WebPage,
|
|
@@ -61,23 +58,20 @@ __all__ = [
|
|
|
61
58
|
"PlotImage",
|
|
62
59
|
"RawJSONObject",
|
|
63
60
|
"Schema",
|
|
64
|
-
"SourceRecord",
|
|
65
61
|
"Table",
|
|
66
62
|
"TextFile",
|
|
67
63
|
"WebPage",
|
|
68
64
|
"XLSFile",
|
|
69
|
-
#
|
|
70
|
-
"
|
|
71
|
-
"
|
|
72
|
-
"
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"ValidationDataSource",
|
|
80
|
-
"XLSFileDirectorySource",
|
|
65
|
+
# datareaders
|
|
66
|
+
"DataReader",
|
|
67
|
+
"DirectoryReader",
|
|
68
|
+
"FileReader",
|
|
69
|
+
"HTMLFileDirectoryReader",
|
|
70
|
+
"ImageFileDirectoryReader",
|
|
71
|
+
"MemoryReader",
|
|
72
|
+
"PDFFileDirectoryReader",
|
|
73
|
+
"TextFileDirectoryReader",
|
|
74
|
+
"XLSFileDirectoryReader",
|
|
81
75
|
# records
|
|
82
76
|
"DataRecord",
|
|
83
77
|
"DataRecordSet",
|
|
@@ -124,8 +124,8 @@ class RecordOpStats:
|
|
|
124
124
|
# identifier for the parent of this record
|
|
125
125
|
record_parent_id: str
|
|
126
126
|
|
|
127
|
-
# identifier for the source of this record
|
|
128
|
-
|
|
127
|
+
# identifier for the source idx of this record
|
|
128
|
+
record_source_idx: str
|
|
129
129
|
|
|
130
130
|
# a dictionary with the record state after being processed by the operator
|
|
131
131
|
record_state: dict[str, Any]
|
|
@@ -406,6 +406,13 @@ class OperatorCostEstimates:
|
|
|
406
406
|
# upper bound on quality
|
|
407
407
|
quality_upper_bound: float | None = None
|
|
408
408
|
|
|
409
|
+
def __rmul__(self, multiplier: float) -> OperatorCostEstimates:
|
|
410
|
+
"""
|
|
411
|
+
Multiply all fields by a scalar.
|
|
412
|
+
"""
|
|
413
|
+
dct = {field.name: getattr(self, field.name) * multiplier for field in fields(self)}
|
|
414
|
+
return OperatorCostEstimates(**dct)
|
|
415
|
+
|
|
409
416
|
def __post_init__(self):
|
|
410
417
|
if self.cardinality_lower_bound is None and self.cardinality_upper_bound is None:
|
|
411
418
|
self.cardinality_lower_bound = self.cardinality
|