edsl 0.1.52__py3-none-any.whl → 0.1.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__version__.py +1 -1
- edsl/interviews/request_token_estimator.py +104 -2
- edsl/invigilators/invigilators.py +5 -0
- edsl/scenarios/file_store.py +73 -23
- {edsl-0.1.52.dist-info → edsl-0.1.53.dist-info}/METADATA +1 -1
- {edsl-0.1.52.dist-info → edsl-0.1.53.dist-info}/RECORD +9 -9
- {edsl-0.1.52.dist-info → edsl-0.1.53.dist-info}/LICENSE +0 -0
- {edsl-0.1.52.dist-info → edsl-0.1.53.dist-info}/WHEEL +0 -0
- {edsl-0.1.52.dist-info → edsl-0.1.53.dist-info}/entry_points.txt +0 -0
edsl/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.1.52"
|
1
|
+
__version__ = "0.1.53"
|
edsl/interviews/request_token_estimator.py
CHANGED
@@ -1,6 +1,101 @@
|
|
1
1
|
from ..jobs.fetch_invigilator import FetchInvigilator
|
2
2
|
from ..scenarios import FileStore
|
3
3
|
|
4
|
+
import math
|
5
|
+
|
6
|
+
# Per-model image pricing table: a flat base charge plus a per-tile charge.
VISION_MODELS = {
    "gpt-4o": dict(base_tokens=85, tile_tokens=170),
    "gpt-4o-mini": dict(base_tokens=2833, tile_tokens=5667),
    "o1": dict(base_tokens=75, tile_tokens=150),
}
|
21
|
+
|
22
|
+
|
23
|
+
def approximate_image_tokens_google(width: int, height: int) -> int:
    """
    Estimate how many tokens a Gemini model charges for a single image.

    The pricing rule applied here (as described for Gemini 2.0 models):
        - An image whose width and height are both at most 384px costs a
          flat 258 tokens.
        - Any other image is billed per 768x768 tile needed to cover it,
          at 258 tokens per tile.

    Note: the result is an *approximation*; the real Gemini API may crop,
    scale, or tile the image somewhat differently.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        The estimated token count for the image, as an integer.

    Raises:
        ValueError: If width or height are not positive integers.
    """
    small_side_limit = 384  # largest side still billed at the flat rate
    flat_small_cost = 258  # tokens for an image within 384x384
    tile_side = 768  # edge length of one billing tile
    cost_per_tile = 258  # tokens charged for each 768x768 tile

    dimensions_valid = (
        isinstance(width, int)
        and isinstance(height, int)
        and width > 0
        and height > 0
    )
    if not dimensions_valid:
        raise ValueError("Image width and height must be positive integers.")

    # Small image: neither side exceeds the threshold.
    if max(width, height) <= small_side_limit:
        return flat_small_cost

    # Larger image: round up so partially covered tiles are still counted.
    columns = math.ceil(width / tile_side)
    rows = math.ceil(height / tile_side)
    return columns * rows * cost_per_tile
|
74
|
+
|
75
|
+
|
76
|
+
def estimate_tokens(model_name, width, height):
    """
    Estimate the token cost of one image for the given model.

    Dispatch order:
        1. The model named exactly "test" always costs 10 tokens.
        2. Any name containing "gemini" uses approximate_image_tokens_google.
        3. Any name containing "claude", or any name missing from
           VISION_MODELS, uses the area heuristic ``width * height / 750``.
        4. Otherwise the image is split into 512x512 tiles and priced with
           the model's ``base_tokens`` and ``tile_tokens`` from VISION_MODELS.

    Args:
        model_name: Name of the model the image will be sent to.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        The estimated token count (a float for the area heuristic, otherwise
        an int for integer inputs).
    """
    if model_name == "test":
        return 10  # for testing purposes

    if "gemini" in model_name:
        return approximate_image_tokens_google(width, height)

    # Claude models and unknown models share the same area-based estimate.
    if "claude" in model_name or model_name not in VISION_MODELS:
        return width * height / 750

    tile_edge = 512
    columns = math.ceil(width / tile_edge)
    rows = math.ceil(height / tile_edge)

    pricing = VISION_MODELS[model_name]
    return pricing["base_tokens"] + pricing["tile_tokens"] * (columns * rows)
|
98
|
+
|
4
99
|
|
5
100
|
class RequestTokenEstimator:
|
6
101
|
"""Estimate the number of tokens that will be required to run the focal task."""
|
@@ -24,15 +119,22 @@ class RequestTokenEstimator:
|
|
24
119
|
elif isinstance(prompt, list):
|
25
120
|
for file in prompt:
|
26
121
|
if isinstance(file, FileStore):
|
27
|
-
|
122
|
+
if file.is_image():
|
123
|
+
model_name = self.interview.model.model
|
124
|
+
width, height = file.get_image_dimensions()
|
125
|
+
token_usage = estimate_tokens(model_name, width, height)
|
126
|
+
file_tokens += token_usage
|
127
|
+
else:
|
128
|
+
file_tokens += file.size * 0.25
|
28
129
|
else:
|
29
130
|
from .exceptions import InterviewTokenError
|
131
|
+
|
30
132
|
raise InterviewTokenError(f"Prompt is of type {type(prompt)}")
|
31
133
|
result: float = len(combined_text) / 4.0 + file_tokens
|
32
134
|
return result
|
33
135
|
|
34
136
|
|
35
|
-
|
36
137
|
if __name__ == "__main__":
|
37
138
|
import doctest
|
139
|
+
|
38
140
|
doctest.testmod(optionflags=doctest.ELLIPSIS)
|
edsl/invigilators/invigilators.py
CHANGED
@@ -397,6 +397,11 @@ class InvigilatorAI(InvigilatorBase):
|
|
397
397
|
data = {
|
398
398
|
"answer": agent_response_dict.edsl_dict.answer
|
399
399
|
if type(agent_response_dict.edsl_dict.answer) is str
|
400
|
+
or type(agent_response_dict.edsl_dict.answer) is dict
|
401
|
+
or type(agent_response_dict.edsl_dict.answer) is list
|
402
|
+
or type(agent_response_dict.edsl_dict.answer) is int
|
403
|
+
or type(agent_response_dict.edsl_dict.answer) is float
|
404
|
+
or type(agent_response_dict.edsl_dict.answer) is bool
|
400
405
|
else "",
|
401
406
|
"comment": agent_response_dict.edsl_dict.comment
|
402
407
|
if agent_response_dict.edsl_dict.comment
|
edsl/scenarios/file_store.py
CHANGED
@@ -17,25 +17,26 @@ from .file_methods import FileMethods
|
|
17
17
|
if TYPE_CHECKING:
|
18
18
|
from .scenario_list import ScenarioList
|
19
19
|
|
20
|
+
|
20
21
|
class FileStore(Scenario):
|
21
22
|
"""
|
22
23
|
A specialized Scenario subclass for managing file content and metadata.
|
23
|
-
|
24
|
+
|
24
25
|
FileStore provides functionality for working with files in EDSL, handling various
|
25
26
|
file formats with appropriate encoding, storage, and access methods. It extends
|
26
27
|
Scenario to allow files to be included in surveys, questions, and other EDSL components.
|
27
|
-
|
28
|
+
|
28
29
|
FileStore supports multiple file formats including text, PDF, Word documents, images,
|
29
30
|
and more. It can load files from local paths or URLs, and provides methods for
|
30
31
|
accessing file content, extracting text, and managing file operations.
|
31
|
-
|
32
|
+
|
32
33
|
Key features:
|
33
34
|
- Base64 encoding for portability and serialization
|
34
35
|
- Lazy loading through temporary files when needed
|
35
36
|
- Automatic MIME type detection
|
36
37
|
- Text extraction from various file formats
|
37
38
|
- Format-specific operations through specialized handlers
|
38
|
-
|
39
|
+
|
39
40
|
Attributes:
|
40
41
|
_path (str): The original file path.
|
41
42
|
_temp_path (str): Path to any generated temporary file.
|
@@ -45,7 +46,7 @@ class FileStore(Scenario):
|
|
45
46
|
base64_string (str): Base64-encoded file content.
|
46
47
|
external_locations (dict): Dictionary of external locations.
|
47
48
|
extracted_text (str): Text extracted from the file.
|
48
|
-
|
49
|
+
|
49
50
|
Examples:
|
50
51
|
>>> import tempfile
|
51
52
|
>>> # Create a text file
|
@@ -53,13 +54,14 @@ class FileStore(Scenario):
|
|
53
54
|
... _ = f.write("Hello World")
|
54
55
|
... _ = f.flush()
|
55
56
|
... fs = FileStore(f.name)
|
56
|
-
|
57
|
+
|
57
58
|
# The following example works locally but is commented out for CI environments
|
58
59
|
# where dependencies like pandoc may not be available:
|
59
60
|
# >>> # FileStore supports various formats
|
60
61
|
# >>> formats = ["txt", "pdf", "docx", "pptx", "md", "py", "json", "csv", "html", "png", "db"]
|
61
62
|
# >>> _ = [FileStore.example(format) for format in formats]
|
62
63
|
"""
|
64
|
+
|
63
65
|
__documentation__ = "https://docs.expectedparrot.com/en/latest/filestore.html"
|
64
66
|
|
65
67
|
def __init__(
|
@@ -75,11 +77,11 @@ class FileStore(Scenario):
|
|
75
77
|
):
|
76
78
|
"""
|
77
79
|
Initialize a new FileStore object.
|
78
|
-
|
80
|
+
|
79
81
|
This constructor creates a FileStore object from either a file path or a base64-encoded
|
80
82
|
string representation of file content. It handles automatic detection of file properties
|
81
83
|
like MIME type, extracts text content when possible, and manages file encoding.
|
82
|
-
|
84
|
+
|
83
85
|
Args:
|
84
86
|
path: Path to the file to load. Can be a local file path or URL.
|
85
87
|
mime_type: MIME type of the file. If not provided, will be auto-detected.
|
@@ -93,7 +95,7 @@ class FileStore(Scenario):
|
|
93
95
|
text will be extracted automatically if possible.
|
94
96
|
**kwargs: Additional keyword arguments. 'filename' can be used as an
|
95
97
|
alternative to 'path'.
|
96
|
-
|
98
|
+
|
97
99
|
Note:
|
98
100
|
If path is a URL (starts with http:// or https://), the file will be
|
99
101
|
downloaded automatically.
|
@@ -138,15 +140,15 @@ class FileStore(Scenario):
|
|
138
140
|
def path(self) -> str:
|
139
141
|
"""
|
140
142
|
Returns a valid path to the file content, creating a temporary file if needed.
|
141
|
-
|
143
|
+
|
142
144
|
This property ensures that a valid file path is always available for the file
|
143
145
|
content, even if the original file is no longer accessible or if the FileStore
|
144
146
|
was created from a base64 string without a path. If the original path doesn't
|
145
147
|
exist, it automatically generates a temporary file from the base64 content.
|
146
|
-
|
148
|
+
|
147
149
|
Returns:
|
148
150
|
A string containing a valid file path to access the file content.
|
149
|
-
|
151
|
+
|
150
152
|
Examples:
|
151
153
|
>>> import tempfile, os
|
152
154
|
>>> with tempfile.NamedTemporaryFile(suffix=".txt", mode="w") as f:
|
@@ -155,8 +157,8 @@ class FileStore(Scenario):
|
|
155
157
|
... fs = FileStore(f.name)
|
156
158
|
... os.path.isfile(fs.path)
|
157
159
|
True
|
158
|
-
|
159
|
-
|
160
|
+
|
161
|
+
|
160
162
|
Notes:
|
161
163
|
- The path may point to a temporary file that will be cleaned up when the
|
162
164
|
Python process exits
|
@@ -319,9 +321,10 @@ class FileStore(Scenario):
|
|
319
321
|
|
320
322
|
link = ConstructDownloadLink(self).html_create_link(self.path, style=None)
|
321
323
|
return f"{parent_html}<br>{link}"
|
322
|
-
|
324
|
+
|
323
325
|
def download_link(self):
|
324
326
|
from .construct_download_link import ConstructDownloadLink
|
327
|
+
|
325
328
|
return ConstructDownloadLink(self).html_create_link(self.path, style=None)
|
326
329
|
|
327
330
|
def encode_file_to_base64_string(self, file_path: str):
|
@@ -572,6 +575,53 @@ class FileStore(Scenario):
|
|
572
575
|
f"Converting {self.suffix} files to pandas DataFrame is not supported"
|
573
576
|
)
|
574
577
|
|
578
|
+
def is_image(self) -> bool:
    """
    Report whether this file's MIME type marks it as an image.

    Returns:
        bool: True when ``mime_type`` begins with "image/", False otherwise.

    Examples:
        >>> fs = FileStore.example("png")
        >>> fs.is_image()
        True
        >>> fs = FileStore.example("txt")
        >>> fs.is_image()
        False
    """
    mime = self.mime_type
    # Every image MIME type shares the "image/" top-level prefix.
    return mime.startswith("image/")
|
595
|
+
|
596
|
+
def get_image_dimensions(self) -> tuple:
    """
    Get the dimensions (width, height) of an image file.

    The image is opened with Pillow from ``self.path``. Pillow is imported
    inside this method, so it is only needed when the method is called.

    Returns:
        tuple: ``(width, height)`` as given by the opened image's ``size``.

    Raises:
        ValueError: If ``is_image()`` reports that the file is not an image.
        ImportError: If PIL (Pillow) is not installed.

    Examples:
        >>> fs = FileStore.example("png")
        >>> width, height = fs.get_image_dimensions()
        >>> isinstance(width, int) and isinstance(height, int)
        True
    """
    if not self.is_image():
        raise ValueError("This file is not an image")

    try:
        from PIL import Image
    except ImportError as exc:
        # Re-raise with install guidance while keeping the original cause.
        raise ImportError(
            "PIL (Pillow) is required to get image dimensions. Install it with: pip install pillow"
        ) from exc

    # The context manager closes the underlying file handle after reading.
    with Image.open(self.path) as img:
        return img.size  # Returns (width, height)
|
624
|
+
|
575
625
|
def __getattr__(self, name):
|
576
626
|
"""
|
577
627
|
Delegate pandas DataFrame methods to the underlying DataFrame if this is a CSV file
|
@@ -662,13 +712,13 @@ class FileStore(Scenario):
|
|
662
712
|
# endobj
|
663
713
|
# xref
|
664
714
|
# 0 7
|
665
|
-
# 0000000000 65535 f
|
666
|
-
# 0000000010 00000 n
|
667
|
-
# 0000000053 00000 n
|
668
|
-
# 0000000100 00000 n
|
669
|
-
# 0000000173 00000 n
|
670
|
-
# 0000000232 00000 n
|
671
|
-
# 0000000272 00000 n
|
715
|
+
# 0000000000 65535 f
|
716
|
+
# 0000000010 00000 n
|
717
|
+
# 0000000053 00000 n
|
718
|
+
# 0000000100 00000 n
|
719
|
+
# 0000000173 00000 n
|
720
|
+
# 0000000232 00000 n
|
721
|
+
# 0000000272 00000 n
|
672
722
|
# trailer
|
673
723
|
# << /Size 7 /Root 1 0 R >>
|
674
724
|
# startxref
|
@@ -748,6 +798,7 @@ class FileStore(Scenario):
|
|
748
798
|
|
749
799
|
if __name__ == "__main__":
|
750
800
|
import doctest
|
801
|
+
|
751
802
|
doctest.testmod()
|
752
803
|
|
753
804
|
# formats = FileMethods.supported_file_types()
|
@@ -756,4 +807,3 @@ if __name__ == "__main__":
|
|
756
807
|
# fs = FileStore.example(file_type)
|
757
808
|
# fs.view()
|
758
809
|
# input("Press Enter to continue...")
|
759
|
-
|
{edsl-0.1.52.dist-info → edsl-0.1.53.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
edsl/__init__.py,sha256=SXi_Zm4kf6H2WW_YeTuF6zRNZEWKzpKa7NRXUzn2Ty4,4593
|
2
|
-
edsl/__version__.py,sha256=
|
2
|
+
edsl/__version__.py,sha256=FdQ5_-vfyHXSNAuzQXtxxDH2WjNJ3g581mH_zvly6Xo,23
|
3
3
|
edsl/agents/__init__.py,sha256=AyhfXjygRHT1Pd9w16lcu5Bu0jnBmMPz86aKP1uRL3Y,93
|
4
4
|
edsl/agents/agent.py,sha256=svTVvvg9eCMUhnb49Bxsf9nAwXragtRaeBkyB6q89EE,54423
|
5
5
|
edsl/agents/agent_list.py,sha256=JA39_6RSmiD2mqJgWr2NWovNxNmu4mhZbYmn5be87NQ,21572
|
@@ -107,12 +107,12 @@ edsl/interviews/interview_status_dictionary.py,sha256=0ZvXLusfOA8xD_Fco4PjEBGwmR
|
|
107
107
|
edsl/interviews/interview_status_enum.py,sha256=KJ-1yLAHdX-p8TiFnM0M3v1tnBwkq4aMCuBX6-ytrI8,229
|
108
108
|
edsl/interviews/interview_status_log.py,sha256=sRiQ9kIT1WcF-8beETn6E7IsdRRrfbco-yjdAjkXncw,3587
|
109
109
|
edsl/interviews/interview_task_manager.py,sha256=wPi5izhsVK5wI5HfMXMLL5NIoucHNCoGXfRuRzI-wYE,3665
|
110
|
-
edsl/interviews/request_token_estimator.py,sha256=
|
110
|
+
edsl/interviews/request_token_estimator.py,sha256=n_C-alSYOFi27cBcIRhtBX-fvklDcvM2Kowte-EDnzM,4833
|
111
111
|
edsl/interviews/statistics.py,sha256=lZCtq79QrDKG3jXao_OWuBRhnly9VyuhM6IdTJaYqPg,2461
|
112
112
|
edsl/invigilators/__init__.py,sha256=fKbZ7p9-kMelpvET3Ku2Owu-tL_apC-8gi9JychpMBY,1843
|
113
113
|
edsl/invigilators/exceptions.py,sha256=ejoF-Gt-YcnW1yHyfpJ3jZm8AC_zD0GCYafRO2LlAMQ,2767
|
114
114
|
edsl/invigilators/invigilator_base.py,sha256=DgrXTK4AAxXr4wg2pzc0p1aGPPf1UUt01C-JW1UBTvo,20099
|
115
|
-
edsl/invigilators/invigilators.py,sha256=
|
115
|
+
edsl/invigilators/invigilators.py,sha256=UH8gy59qq0_f9jzumDbdugF0SvGW_eIr2GT5zCUO8V0,22355
|
116
116
|
edsl/invigilators/prompt_constructor.py,sha256=THHGcZPI-QUOH8Z9cQEzH7bZEoo0V_Nc_Phlhc9AzL0,19115
|
117
117
|
edsl/invigilators/prompt_helpers.py,sha256=LuMZFZkInPY8M7Rw9fG9rpJIcT89tr2_Iq10ZHH_Y4A,5409
|
118
118
|
edsl/invigilators/question_instructions_prompt_builder.py,sha256=E5zpwctpt_5JjONkZRcMwB0MACAzDvvnzUhmuWTnjd0,9684
|
@@ -276,7 +276,7 @@ edsl/scenarios/directory_scanner.py,sha256=gnDXU1jKSjSE3LXEhE7ilfJUL_sxK2HHmsA2L
|
|
276
276
|
edsl/scenarios/document_chunker.py,sha256=EpB0V0oxLzpKntl00Qa3VZNPS7sg9aXdYyqKxhFFzTM,7680
|
277
277
|
edsl/scenarios/exceptions.py,sha256=FeORBm90UthKHDp7cE8I7KJgyA3-pFKNpoivZRr8ifc,10636
|
278
278
|
edsl/scenarios/file_methods.py,sha256=cB_IPVTGz4_yJiRMTdNTvpW4l43lrTbyJOV3Pnm6UPs,2631
|
279
|
-
edsl/scenarios/file_store.py,sha256=
|
279
|
+
edsl/scenarios/file_store.py,sha256=slqSIENW6SP1dhnXTviq4umlvGHeYsDB3SM24t0ll_I,28033
|
280
280
|
edsl/scenarios/handlers/__init__.py,sha256=9r1fDjUviGXso9h4d05wG9RECfqzfps55CQgb-ojCBo,848
|
281
281
|
edsl/scenarios/handlers/csv_file_store.py,sha256=kXOms0ph5JJj6jSbpfQ-SZjuT4vvSRhq5AGpv1L4TPQ,1369
|
282
282
|
edsl/scenarios/handlers/docx_file_store.py,sha256=KSKAAUIWF2K5xr92nx7UGQ9djgtDX4ke-Eyik8QAdlQ,2155
|
@@ -358,8 +358,8 @@ edsl/utilities/repair_functions.py,sha256=EXkXsqnmgPqj9b3dff1cZnJyaZw-qEvGENXCRH
|
|
358
358
|
edsl/utilities/restricted_python.py,sha256=248N2p5EWHDSpcK1G-q7DUoJeWy4sB6aO-RV0-5O7uY,2038
|
359
359
|
edsl/utilities/template_loader.py,sha256=SCAcnTnxNQ67MNSkmfz7F-S_u2peyGn2j1oRIqi1wfg,870
|
360
360
|
edsl/utilities/utilities.py,sha256=irHheAGOnl_6RwI--Hi9StVzvsHcWCqB48PWsWJQYOw,12045
|
361
|
-
edsl-0.1.
|
362
|
-
edsl-0.1.
|
363
|
-
edsl-0.1.
|
364
|
-
edsl-0.1.
|
365
|
-
edsl-0.1.
|
361
|
+
edsl-0.1.53.dist-info/LICENSE,sha256=_qszBDs8KHShVYcYzdMz3HNMtH-fKN_p5zjoVAVumFc,1111
|
362
|
+
edsl-0.1.53.dist-info/METADATA,sha256=WUkrcqMnf8uKQvTyrQTx0q3oVFDurmcI3HkXz9Ljk6g,12670
|
363
|
+
edsl-0.1.53.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
364
|
+
edsl-0.1.53.dist-info/entry_points.txt,sha256=JnG7xqMtHaQu9BU-yPATxdyCeA48XJpuclnWCqMfIMU,38
|
365
|
+
edsl-0.1.53.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|