palimpzest 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.0.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.3.dist-info/RECORD +0 -83
  69. palimpzest-0.5.3.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.3.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
palimpzest/__init__.py CHANGED
@@ -1,7 +1,5 @@
1
- from palimpzest.constants import MAX_ROWS, Cardinality
2
-
3
- # data management
4
- from palimpzest.datamanager.datamanager import DataDirectory
1
+ from palimpzest.constants import Cardinality
2
+ from palimpzest.core.data.datareaders import DataReader
5
3
  from palimpzest.policy import (
6
4
  MaxQuality,
7
5
  MaxQualityAtFixedCost,
@@ -13,16 +11,14 @@ from palimpzest.policy import (
13
11
  PlanCost,
14
12
  Policy,
15
13
  )
16
-
17
- # dataset functionality
14
+ from palimpzest.query.processor.config import QueryProcessorConfig
18
15
  from palimpzest.sets import Dataset
19
16
 
20
17
  __all__ = [
21
18
  # constants
22
- "MAX_ROWS",
23
19
  "Cardinality",
24
- # datamanager
25
- "DataDirectory",
20
+ # core
21
+ "DataReader",
26
22
  # policy
27
23
  "MaxQuality",
28
24
  "MaxQualityAtFixedCost",
@@ -33,6 +29,8 @@ __all__ = [
33
29
  "MinTimeAtFixedQuality",
34
30
  "PlanCost",
35
31
  "Policy",
32
+ # query
33
+ "QueryProcessorConfig",
36
34
  # sets
37
35
  "Dataset",
38
36
  ]
palimpzest/constants.py CHANGED
@@ -27,14 +27,52 @@ class PromptStrategy(str, Enum):
27
27
  PromptStrategy describes the prompting technique to be used by a Generator when
28
28
  performing some task with a specified Model.
29
29
  """
30
+ # Chain-of-Thought Boolean Prompt Strategies
30
31
  COT_BOOL = "chain-of-thought-bool"
32
+ # COT_BOOL_CRITIC = "chain-of-thought-bool-critic"
33
+ # COT_BOOL_REFINE = "chain-of-thought-bool-refine"
34
+
35
+ # Chain-of-Thought Boolean with Image Prompt Strategies
31
36
  COT_BOOL_IMAGE = "chain-of-thought-bool-image"
37
+ # COT_BOOL_IMAGE_CRITIC = "chain-of-thought-bool-image-critic"
38
+ # COT_BOOL_IMAGE_REFINE = "chain-of-thought-bool-image-refine"
39
+
40
+ # Chain-of-Thought Question Answering Prompt Strategies
32
41
  COT_QA = "chain-of-thought-question"
42
+ COT_QA_CRITIC = "chain-of-thought-question-critic"
43
+ COT_QA_REFINE = "chain-of-thought-question-refine"
44
+
45
+ # Chain-of-Thought Question with Image Prompt Strategies
33
46
  COT_QA_IMAGE = "chain-of-thought-question-image"
47
+ COT_QA_IMAGE_CRITIC = "chain-of-thought-question-critic-image"
48
+ COT_QA_IMAGE_REFINE = "chain-of-thought-question-refine-image"
49
+
50
+ # Mixture-of-Agents Prompt Strategies
34
51
  COT_MOA_PROPOSER = "chain-of-thought-mixture-of-agents-proposer"
35
52
  COT_MOA_PROPOSER_IMAGE = "chain-of-thought-mixture-of-agents-proposer-image"
36
53
  COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation"
37
54
 
55
+ def is_image_prompt(self):
56
+ return "image" in self.value
57
+
58
+ def is_cot_bool_prompt(self):
59
+ return "chain-of-thought-bool" in self.value
60
+
61
+ def is_cot_qa_prompt(self):
62
+ return "chain-of-thought-question" in self.value
63
+
64
+ def is_critic_prompt(self):
65
+ return "critic" in self.value
66
+
67
+ def is_refine_prompt(self):
68
+ return "refine" in self.value
69
+
70
+ def is_moa_proposer_prompt(self):
71
+ return "mixture-of-agents-proposer" in self.value
72
+
73
+ def is_moa_aggregator_prompt(self):
74
+ return "mixture-of-agents-aggregation" in self.value
75
+
38
76
 
39
77
  class AggFunc(str, Enum):
40
78
  COUNT = "count"
@@ -67,10 +105,11 @@ HTML_EXTENSIONS = [".html", ".htm"]
67
105
  # the number of seconds the parallel execution will sleep for while waiting for futures to complete
68
106
  PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS = 0.3
69
107
 
108
+ # default PDF parser
109
+ DEFAULT_PDF_PROCESSOR = "pypdf"
110
+
70
111
  # character limit for various IDs
71
112
  MAX_ID_CHARS = 10
72
- DEFAULT_DATASET_ID_CHARS = 16
73
- MAX_DATASET_ID_CHARS = 100
74
113
 
75
114
  # retry LLM executions 2^x * (multiplier) for up to 10 seconds and at most 4 times
76
115
  RETRY_MULTIPLIER = 2
@@ -98,9 +137,15 @@ LOCAL_SCAN_TIME_PER_KB = 1 / (float(500) * 1024)
98
137
  # Assume 30 GB/sec for sequential access of memory
99
138
  MEMORY_SCAN_TIME_PER_KB = 1 / (float(30) * 1024 * 1024)
100
139
 
140
+ # Assume 1 KB per record
141
+ NAIVE_BYTES_PER_RECORD = 1024
142
+
101
143
  # Rough conversion from # of characters --> # of tokens; assumes 1 token ~= 4 chars
102
144
  TOKENS_PER_CHARACTER = 0.25
103
145
 
146
+ # Rough estimate of the number of tokens the context is allowed to take up for MIXTRAL and LLAMA3 models
147
+ MIXTRAL_LLAMA_CONTEXT_TOKENS_LIMIT = 6000
148
+
104
149
  # a naive estimate for the input record size
105
150
  NAIVE_EST_SOURCE_RECORD_SIZE_IN_BYTES = 1_000_000
106
151
 
@@ -128,11 +173,6 @@ NAIVE_PDF_PROCESSOR_TIME_PER_RECORD = 10.0
128
173
  # Whether or not to log LLM outputs
129
174
  LOG_LLM_OUTPUT = False
130
175
 
131
- # Derived schema prefix
132
- DERIVED_SCHEMA_PREFIX = "DerivedSchema_"
133
-
134
- # Derived source_id for records created from a DataFrame
135
- FROM_DF_PREFIX = "FROM_DF_"
136
176
 
137
177
  #### MODEL PERFORMANCE & COST METRICS ####
138
178
  # I've looked across models and grouped knowledge into commonly used categories:
palimpzest/core/__init__.py CHANGED
@@ -1,15 +1,13 @@
1
- from palimpzest.core.data.datasources import (
2
- DataSource,
3
- DirectorySource,
4
- FileSource,
5
- HTMLFileDirectorySource,
6
- ImageFileDirectorySource,
7
- MemorySource,
8
- PDFFileDirectorySource,
9
- TextFileDirectorySource,
10
- UserSource,
11
- ValidationDataSource,
12
- XLSFileDirectorySource,
1
+ from palimpzest.core.data.datareaders import (
2
+ DataReader,
3
+ DirectoryReader,
4
+ FileReader,
5
+ HTMLFileDirectoryReader,
6
+ ImageFileDirectoryReader,
7
+ MemoryReader,
8
+ PDFFileDirectoryReader,
9
+ TextFileDirectoryReader,
10
+ XLSFileDirectoryReader,
13
11
  )
14
12
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
15
13
  from palimpzest.core.lib.fields import (
@@ -33,7 +31,6 @@ from palimpzest.core.lib.schemas import (
33
31
  PlotImage,
34
32
  RawJSONObject,
35
33
  Schema,
36
- SourceRecord,
37
34
  Table,
38
35
  TextFile,
39
36
  WebPage,
@@ -61,23 +58,20 @@ __all__ = [
61
58
  "PlotImage",
62
59
  "RawJSONObject",
63
60
  "Schema",
64
- "SourceRecord",
65
61
  "Table",
66
62
  "TextFile",
67
63
  "WebPage",
68
64
  "XLSFile",
69
- # datasources
70
- "DataSource",
71
- "DirectorySource",
72
- "FileSource",
73
- "HTMLFileDirectorySource",
74
- "ImageFileDirectorySource",
75
- "MemorySource",
76
- "PDFFileDirectorySource",
77
- "TextFileDirectorySource",
78
- "UserSource",
79
- "ValidationDataSource",
80
- "XLSFileDirectorySource",
65
+ # datareaders
66
+ "DataReader",
67
+ "DirectoryReader",
68
+ "FileReader",
69
+ "HTMLFileDirectoryReader",
70
+ "ImageFileDirectoryReader",
71
+ "MemoryReader",
72
+ "PDFFileDirectoryReader",
73
+ "TextFileDirectoryReader",
74
+ "XLSFileDirectoryReader",
81
75
  # records
82
76
  "DataRecord",
83
77
  "DataRecordSet",
palimpzest/core/data/dataclasses.py CHANGED
@@ -124,8 +124,8 @@ class RecordOpStats:
124
124
  # identifier for the parent of this record
125
125
  record_parent_id: str
126
126
 
127
- # idenifier for the source of this record
128
- record_source_id: str
127
+ # idenifier for the source idx of this record
128
+ record_source_idx: str
129
129
 
130
130
  # a dictionary with the record state after being processed by the operator
131
131
  record_state: dict[str, Any]
@@ -406,6 +406,13 @@ class OperatorCostEstimates:
406
406
  # upper bound on quality
407
407
  quality_upper_bound: float | None = None
408
408
 
409
+ def __rmul__(self, multiplier: float) -> OperatorCostEstimates:
410
+ """
411
+ Multiply all fields by a scalar.
412
+ """
413
+ dct = {field.name: getattr(self, field.name) * multiplier for field in fields(self)}
414
+ return OperatorCostEstimates(**dct)
415
+
409
416
  def __post_init__(self):
410
417
  if self.cardinality_lower_bound is None and self.cardinality_upper_bound is None:
411
418
  self.cardinality_lower_bound = self.cardinality