edsl 0.1.55__py3-none-any.whl → 0.1.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__version__.py +1 -1
- edsl/base/data_transfer_models.py +15 -4
- edsl/coop/coop.py +3 -3
- edsl/dataset/dataset_operations_mixin.py +216 -180
- edsl/inference_services/services/google_service.py +5 -2
- edsl/interviews/request_token_estimator.py +8 -0
- edsl/invigilators/invigilators.py +26 -13
- edsl/jobs/jobs_pricing_estimation.py +176 -113
- edsl/language_models/language_model.py +24 -6
- edsl/language_models/price_manager.py +171 -36
- edsl/results/result.py +52 -30
- edsl/scenarios/file_store.py +60 -30
- {edsl-0.1.55.dist-info → edsl-0.1.57.dist-info}/METADATA +2 -2
- {edsl-0.1.55.dist-info → edsl-0.1.57.dist-info}/RECORD +17 -17
- {edsl-0.1.55.dist-info → edsl-0.1.57.dist-info}/LICENSE +0 -0
- {edsl-0.1.55.dist-info → edsl-0.1.57.dist-info}/WHEEL +0 -0
- {edsl-0.1.55.dist-info → edsl-0.1.57.dist-info}/entry_points.txt +0 -0
edsl/language_models/price_manager.py
CHANGED
@@ -1,4 +1,22 @@
-from
+from dataclasses import dataclass
+from typing import Dict, Literal, Tuple, Union
+from collections import namedtuple
+
+
+@dataclass
+class ResponseCost:
+    """
+    Class for storing the cost and token usage of a language model response.
+
+    If an error occurs when computing the cost, the total_cost will contain a string with the error message.
+    All other fields will be None.
+    """
+
+    input_tokens: Union[int, None] = None
+    output_tokens: Union[int, None] = None
+    input_price_per_million_tokens: Union[float, None] = None
+    output_price_per_million_tokens: Union[float, None] = None
+    total_cost: Union[float, str, None] = None
 
 
 class PriceManager:
@@ -64,28 +82,127 @@ class PriceManager:
         return self._price_lookup.copy()
 
     def _get_fallback_price(self, inference_service: str) -> Dict:
-        """
+        """
+        Get fallback prices for a service.
+        - First fallback: The highest input and output prices for that service from the price lookup.
+        - Second fallback: $1.00 per million tokens (for both input and output).
+
+        Args:
+            inference_service (str): The inference service name
+
+        Returns:
+            Dict: Price information
+        """
+        PriceEntry = namedtuple("PriceEntry", ["tokens_per_usd", "price_info"])
+
         service_prices = [
             prices
             for (service, _), prices in self._price_lookup.items()
             if service == inference_service
         ]
 
-
-
+        default_price_info = {
+            "one_usd_buys": 1_000_000,
+            "service_stated_token_qty": 1_000_000,
+            "service_stated_token_price": 1.0,
+        }
+
+        # Find the most expensive price entries (lowest tokens per USD)
+        input_price_info = default_price_info
+        output_price_info = default_price_info
+
+        input_prices = [
+            PriceEntry(float(p["input"]["one_usd_buys"]), p["input"])
+            for p in service_prices
+            if "input" in p
         ]
-
+        if input_prices:
+            input_price_info = min(
+                input_prices, key=lambda price: price.tokens_per_usd
+            ).price_info
 
-
-            float(p["output"]["one_usd_buys"])
+        output_prices = [
+            PriceEntry(float(p["output"]["one_usd_buys"]), p["output"])
+            for p in service_prices
+            if "output" in p
         ]
-
+        if output_prices:
+            output_price_info = min(
+                output_prices, key=lambda price: price.tokens_per_usd
+            ).price_info
 
         return {
-            "input":
-            "output":
+            "input": input_price_info,
+            "output": output_price_info,
         }
 
+    def get_price_per_million_tokens(
+        self,
+        relevant_prices: Dict,
+        token_type: Literal["input", "output"],
+    ) -> Dict:
+        """
+        Get the price per million tokens for a specific service, model, and token type.
+        """
+        service_price = relevant_prices[token_type]["service_stated_token_price"]
+        service_qty = relevant_prices[token_type]["service_stated_token_qty"]
+
+        if service_qty == 1_000_000:
+            price_per_million_tokens = service_price
+        elif service_qty == 1_000:
+            price_per_million_tokens = service_price * 1_000
+        else:
+            price_per_token = service_price / service_qty
+            price_per_million_tokens = round(price_per_token * 1_000_000, 10)
+        return price_per_million_tokens
+
+    def _calculate_total_cost(
+        self,
+        relevant_prices: Dict,
+        input_tokens: int,
+        output_tokens: int,
+    ) -> float:
+        """
+        Calculate the total cost for a model usage based on input and output tokens.
+
+        Returns:
+            float: Total cost
+        """
+        # Extract price information
+        try:
+            inverse_output_price = relevant_prices["output"]["one_usd_buys"]
+            inverse_input_price = relevant_prices["input"]["one_usd_buys"]
+        except Exception as e:
+            if "output" not in relevant_prices:
+                raise KeyError(
+                    f"Could not fetch prices from {relevant_prices} - {e}; Missing 'output' key."
+                )
+            if "input" not in relevant_prices:
+                raise KeyError(
+                    f"Could not fetch prices from {relevant_prices} - {e}; Missing 'input' key."
+                )
+            raise Exception(f"Could not fetch prices from {relevant_prices} - {e}")
+
+        # Calculate input cost
+        if inverse_input_price == "infinity":
+            input_cost = 0
+        else:
+            try:
+                input_cost = input_tokens / float(inverse_input_price)
+            except Exception as e:
+                raise Exception(f"Could not compute input price - {e}")
+
+        # Calculate output cost
+        if inverse_output_price == "infinity":
+            output_cost = 0
+        else:
+            try:
+                output_cost = output_tokens / float(inverse_output_price)
+            except Exception as e:
+                raise Exception(f"Could not compute output price - {e}")
+
+        return input_cost + output_cost
+
     def calculate_cost(
         self,
         inference_service: str,
@@ -93,43 +210,61 @@ class PriceManager:
         usage: Dict[str, Union[str, int]],
         input_token_name: str,
         output_token_name: str,
-    ) ->
-        """
-
+    ) -> ResponseCost:
+        """
+        Calculate the cost and token usage for a model response.
 
-
+        Args:
+            inference_service (str): The inference service identifier
+            model (str): The model identifier
+            usage (Dict[str, Union[str, int]]): Dictionary containing token usage information
+            input_token_name (str): Key name for input tokens in the usage dict
+            output_token_name (str): Key name for output tokens in the usage dict
+
+        Returns:
+            ResponseCost: Object containing token counts and total cost
+        """
         try:
             input_tokens = int(usage[input_token_name])
             output_tokens = int(usage[output_token_name])
         except Exception as e:
-            return
+            return ResponseCost(
+                total_cost=f"Could not fetch tokens from model response: {e}",
+            )
 
-        # Extract price information
         try:
-
-            inverse_input_price = relevant_prices["input"]["one_usd_buys"]
+            relevant_prices = self.get_price(inference_service, model)
         except Exception as e:
-
-
-
-                return f"Could not fetch prices from {relevant_prices} - {e}; Missing 'input' key."
-            return f"Could not fetch prices from {relevant_prices} - {e}"
+            return ResponseCost(
+                total_cost=f"Could not fetch prices from {inference_service} - {model}: {e}",
+            )
 
-
-
-
-
-
-
+        try:
+            input_price_per_million_tokens = self.get_price_per_million_tokens(
+                relevant_prices, "input"
+            )
+            output_price_per_million_tokens = self.get_price_per_million_tokens(
+                relevant_prices, "output"
+            )
+        except Exception as e:
+            return ResponseCost(
+                total_cost=f"Could not compute price per million tokens: {e}",
+            )
 
-
-
-
-
-
-
+        try:
+            total_cost = self._calculate_total_cost(
+                relevant_prices, input_tokens, output_tokens
+            )
+        except Exception as e:
+            return ResponseCost(total_cost=f"{e}")
 
-        return
+        return ResponseCost(
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            input_price_per_million_tokens=input_price_per_million_tokens,
+            output_price_per_million_tokens=output_price_per_million_tokens,
+            total_cost=total_cost,
+        )
 
     @property
     def is_initialized(self) -> bool:
edsl/results/result.py
CHANGED
@@ -20,6 +20,7 @@ The Result class inherits from both Base (for serialization) and UserDict (for
 dictionary-like behavior), allowing it to be accessed like a dictionary while
 maintaining a rich object model.
 """
+
 from __future__ import annotations
 import inspect
 from collections import UserDict
@@ -40,6 +41,7 @@ if TYPE_CHECKING:
     QuestionName = str
     AnswerValue = Any
 
+
 class AgentNamer:
     """Maintains a registry of agent names to ensure unique naming."""
 
@@ -61,20 +63,20 @@ agent_namer = AgentNamer().get_name
 class Result(Base, UserDict):
     """
     The Result class captures the complete data from one agent interview.
-
+
     A Result object stores the agent, scenario, language model, and all answers
     provided during an interview, along with metadata such as token usage,
     caching information, and raw model responses. It provides a rich interface
     for accessing this data and supports serialization for storage and retrieval.
-
+
     Key features:
-
+
     - Dictionary-like access to all data through the UserDict interface
     - Properties for convenient access to common attributes (agent, scenario, model, answer)
    - Rich data structure with sub-dictionaries for organization
     - Support for scoring results against reference answers
     - Serialization to/from dictionaries for storage
-
+
     Results are typically created by the Jobs system when running interviews and
     collected into a Results collection for analysis. You rarely need to create
     Result objects manually.
@@ -260,6 +262,7 @@ class Result(Base, UserDict):
         for key in self.problem_keys:
             if key in expression and key + "." not in expression:
                 from .exceptions import ResultsColumnNotFoundError
+
                 raise ResultsColumnNotFoundError(
                     f"Key by itself {key} is problematic. Use the full key {key + '.' + key} name instead."
                 )
@@ -268,6 +271,7 @@ class Result(Base, UserDict):
     def code(self):
         """Return a string of code that can be used to recreate the Result object."""
         from .exceptions import ResultsError
+
         raise ResultsError("The code() method is not implemented for Result objects")
 
     @property
@@ -316,7 +320,7 @@ class Result(Base, UserDict):
 
     def get_value(self, data_type: str, key: str) -> Any:
         """Return the value for a given data type and key.
-
+
         This method provides a consistent way to access values across different
         sub-dictionaries in the Result object. It's particularly useful when you
         need to programmatically access values without knowing which data type
@@ -331,7 +335,7 @@ class Result(Base, UserDict):
 
         Returns:
             The value associated with the key in the specified data type
-
+
         Examples:
             >>> r = Result.example()
             >>> r.get_value("answer", "how_feeling")
@@ -344,15 +348,15 @@ class Result(Base, UserDict):
     @property
     def key_to_data_type(self) -> dict[str, str]:
         """A mapping of attribute names to their container data types.
-
+
         This property returns a dictionary that maps each attribute name (like 'how_feeling')
         to its containing data type or category (like 'answer'). This is useful for
         determining which part of the Result object a particular attribute belongs to,
         especially when working with data programmatically.
-
+
         If a key name appears in multiple data types, the property will automatically
         rename the conflicting keys by appending the data type name to avoid ambiguity.
-
+
         Returns:
             A dictionary mapping attribute names to their data types
 
@@ -435,7 +439,7 @@ class Result(Base, UserDict):
                 else prompt_obj.to_dict()
             )
             d[key] = new_prompt_dict
-
+
         if self.indices is not None:
             d["indices"] = self.indices
 
@@ -495,7 +499,7 @@ class Result(Base, UserDict):
             comments_dict=json_dict.get("comments_dict", {}),
             cache_used_dict=json_dict.get("cache_used_dict", {}),
             cache_keys=json_dict.get("cache_keys", {}),
-            indices
+            indices=json_dict.get("indices", None),
         )
         if "interview_hash" in json_dict:
             result.interview_hash = json_dict["interview_hash"]
@@ -522,14 +526,14 @@ class Result(Base, UserDict):
         from .results import Results
 
         return Results.example()[0]
-
+
     def score_with_answer_key(self, answer_key: dict) -> dict[str, int]:
         """Score the result against a reference answer key.
-
-        This method evaluates the correctness of answers by comparing them to a
-        provided answer key. It returns a dictionary with counts of correct,
+
+        This method evaluates the correctness of answers by comparing them to a
+        provided answer key. It returns a dictionary with counts of correct,
         incorrect, and missing answers.
-
+
         The answer key can contain either single values or lists of acceptable values.
         If a list is provided, the answer is considered correct if it matches any
         value in the list.
@@ -541,7 +545,7 @@ class Result(Base, UserDict):
         Returns:
             A dictionary with keys 'correct', 'incorrect', and 'missing', indicating
             the counts of each answer type.
-
+
         Examples:
             >>> Result.example()['answer']
             {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
@@ -550,21 +554,24 @@ class Result(Base, UserDict):
             >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
             >>> Result.example().score_with_answer_key(answer_key)
             {'correct': 2, 'incorrect': 0, 'missing': 0}
-
+
             >>> # Using answer key with multiple acceptable answers
             >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': ['Great', 'Good']}
             >>> Result.example().score_with_answer_key(answer_key)
             {'correct': 2, 'incorrect': 0, 'missing': 0}
         """
-        final_scores = {
+        final_scores = {"correct": 0, "incorrect": 0, "missing": 0}
         for question_name, answer in self.answer.items():
             if question_name in answer_key:
-                if
-
+                if (
+                    answer == answer_key[question_name]
+                    or answer in answer_key[question_name]
+                ):
+                    final_scores["correct"] += 1
                 else:
-                    final_scores[
+                    final_scores["incorrect"] += 1
             else:
-                final_scores[
+                final_scores["missing"] += 1
 
         return final_scores
 
@@ -584,6 +591,7 @@ class Result(Base, UserDict):
                 params[k] = v.default
             else:
                 from .exceptions import ResultsError
+
                 raise ResultsError(f"Parameter {k} not found in Result object")
         return scoring_function(**params)
 
@@ -654,16 +662,30 @@ class Result(Base, UserDict):
         raw_model_results_dictionary[question_name + "_raw_model_response"] = (
             result.raw_model_response
         )
-        raw_model_results_dictionary[question_name + "
-
+        raw_model_results_dictionary[question_name + "_input_tokens"] = (
+            result.input_tokens
+        )
+        raw_model_results_dictionary[question_name + "_output_tokens"] = (
+            result.output_tokens
+        )
+        raw_model_results_dictionary[
+            question_name + "_input_price_per_million_tokens"
+        ] = result.input_price_per_million_tokens
+        raw_model_results_dictionary[
+            question_name + "_output_price_per_million_tokens"
+        ] = result.output_price_per_million_tokens
+        raw_model_results_dictionary[question_name + "_cost"] = (
+            result.total_cost
+        )
+        one_usd_buys = (
             "NA"
-            if isinstance(result.
-            or result.
-            or result.
-            else 1.0 / result.
+            if isinstance(result.total_cost, str)
+            or result.total_cost == 0
+            or result.total_cost is None
+            else 1.0 / result.total_cost
         )
         raw_model_results_dictionary[question_name + "_one_usd_buys"] = (
-
+            one_usd_buys
        )
         cache_used_dictionary[question_name] = result.cache_used
 
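Note: the reworked score_with_answer_key body above reduces to a small equality-or-membership rule. The standalone sketch below restates that rule outside the Result class; the sample answers mirror the doctest in the diff, and nothing else here is taken from the package.

def score_with_answer_key(answers: dict, answer_key: dict) -> dict:
    # Same counting rule as the diff above: an answer is correct when it equals
    # the key's value or is contained in it (e.g. a list of acceptable answers).
    scores = {"correct": 0, "incorrect": 0, "missing": 0}
    for question_name, answer in answers.items():
        if question_name in answer_key:
            expected = answer_key[question_name]
            if answer == expected or answer in expected:
                scores["correct"] += 1
            else:
                scores["incorrect"] += 1
        else:
            scores["missing"] += 1
    return scores


answers = {"how_feeling": "OK", "how_feeling_yesterday": "Great"}
answer_key = {"how_feeling": "OK", "how_feeling_yesterday": ["Great", "Good"]}
print(score_with_answer_key(answers, answer_key))  # {'correct': 2, 'incorrect': 0, 'missing': 0}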
edsl/scenarios/file_store.py
CHANGED
@@ -294,10 +294,23 @@ class FileStore(Scenario):
 
     def upload_google(self, refresh: bool = False) -> None:
         import google.generativeai as genai
+        import google
 
-
-
-
+        try:
+            genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+            google_info = genai.upload_file(self.path, mime_type=self.mime_type)
+            self.external_locations["google"] = google_info.to_dict()
+            while True:
+                file_metadata = genai.get_file(name=google_info.name)
+                file_state = file_metadata.state
+
+                if file_state == 2:  # "ACTIVE":
+                    break
+                elif file_state == 10:  # "FAILED":
+                    break
+        except Exception as e:
+            print(f"Error uploading to Google: {e}")
+            raise
 
     @classmethod
     @remove_edsl_version
@@ -592,14 +605,14 @@ class FileStore(Scenario):
         """
         # Check if the mime type starts with 'image/'
         return self.mime_type.startswith("image/")
-
+
     def is_video(self) -> bool:
         """
         Check if the file is a video by examining its MIME type.
-
+
         Returns:
             bool: True if the file is a video, False otherwise.
-
+
         Examples:
             >>> fs = FileStore.example("mp4")
             >>> fs.is_video()
@@ -613,19 +626,19 @@ class FileStore(Scenario):
         """
         # Check if the mime type starts with 'video/'
         return self.mime_type.startswith("video/")
-
+
     def get_video_metadata(self) -> dict:
         """
         Get metadata about a video file such as duration, dimensions, codec, etc.
         Uses FFmpeg to extract the information if available.
-
+
         Returns:
             dict: A dictionary containing video metadata, or a dictionary with
                 error information if metadata extraction fails.
-
+
         Raises:
             ValueError: If the file is not a video.
-
+
         Example:
             >>> fs = FileStore.example("mp4")
             >>> metadata = fs.get_video_metadata()
@@ -634,47 +647,63 @@ class FileStore(Scenario):
         """
         if not self.is_video():
             raise ValueError("This file is not a video")
-
+
         # We'll try to use ffprobe (part of ffmpeg) to get metadata
         import subprocess
         import json
-
+
         try:
             # Run ffprobe to get video metadata in JSON format
             result = subprocess.run(
                 [
-                    "ffprobe",
-                    "-
+                    "ffprobe",
+                    "-v",
+                    "quiet",
+                    "-print_format",
+                    "json",
+                    "-show_format",
+                    "-show_streams",
+                    self.path,
                 ],
-                capture_output=True,
+                capture_output=True,
+                text=True,
+                check=True,
             )
-
+
             # Parse the JSON output
             metadata = json.loads(result.stdout)
-
+
             # Extract some common useful fields into a more user-friendly format
             simplified = {
                 "format": metadata.get("format", {}).get("format_name", "unknown"),
-                "duration_seconds": float(
+                "duration_seconds": float(
+                    metadata.get("format", {}).get("duration", 0)
+                ),
                 "size_bytes": int(metadata.get("format", {}).get("size", 0)),
                 "bit_rate": int(metadata.get("format", {}).get("bit_rate", 0)),
                 "streams": len(metadata.get("streams", [])),
             }
-
+
             # Add video stream info if available
-            video_streams = [
+            video_streams = [
+                s for s in metadata.get("streams", []) if s.get("codec_type") == "video"
+            ]
             if video_streams:
                 video = video_streams[0]  # Get the first video stream
                 simplified["video"] = {
                     "codec": video.get("codec_name", "unknown"),
                     "width": video.get("width", 0),
                     "height": video.get("height", 0),
-                    "frame_rate": eval(
+                    "frame_rate": eval(
+                        video.get("r_frame_rate", "0/1")
+                    ),  # Convert "30/1" to 30.0
                     "pixel_format": video.get("pix_fmt", "unknown"),
                 }
-
+
             # Add audio stream info if available
-            audio_streams = [
+            audio_streams = [
+                s for s in metadata.get("streams", []) if s.get("codec_type") == "audio"
+            ]
             if audio_streams:
                 audio = audio_streams[0]  # Get the first audio stream
                 simplified["audio"] = {
@@ -682,14 +711,15 @@ class FileStore(Scenario):
                     "channels": audio.get("channels", 0),
                     "sample_rate": audio.get("sample_rate", "unknown"),
                 }
-
+
             # Return both the complete metadata and simplified version
-            return {
-
-
-
-
-
+            return {"simplified": simplified, "full": metadata}
+
+        except (
+            subprocess.SubprocessError,
+            FileNotFoundError,
+            json.JSONDecodeError,
+        ) as e:
             # If ffprobe is not available or fails, return basic info
             return {
                 "error": str(e),
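Note: the get_video_metadata changes above are mostly a reflow of the ffprobe call plus a narrower except clause. For reference, the same probe can be run outside FileStore roughly as follows; this is a sketch that assumes ffprobe (from FFmpeg) is on PATH, and the file path is a placeholder.

import json
import subprocess


def probe_video(path: str) -> dict:
    # Same ffprobe invocation as in the diff: quiet JSON output with format and stream info.
    result = subprocess.run(
        [
            "ffprobe",
            "-v", "quiet",
            "-print_format", "json",
            "-show_format", "-show_streams",
            path,
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    metadata = json.loads(result.stdout)
    # Pick the first video stream, if any, and summarize a few common fields.
    video = next(
        (s for s in metadata.get("streams", []) if s.get("codec_type") == "video"), {}
    )
    return {
        "format": metadata.get("format", {}).get("format_name", "unknown"),
        "duration_seconds": float(metadata.get("format", {}).get("duration", 0)),
        "width": video.get("width", 0),
        "height": video.get("height", 0),
        "codec": video.get("codec_name", "unknown"),
    }


if __name__ == "__main__":
    print(probe_video("example.mp4"))  # placeholder path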
{edsl-0.1.55.dist-info → edsl-0.1.57.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: edsl
-Version: 0.1.55
+Version: 0.1.57
 Summary: Create and analyze LLM-based surveys
 Home-page: https://www.expectedparrot.com/
 License: MIT
@@ -23,7 +23,7 @@ Requires-Dist: azure-ai-inference (>=1.0.0b3,<2.0.0)
 Requires-Dist: black[jupyter] (>=24.4.2,<25.0.0)
 Requires-Dist: boto3 (>=1.34.161,<2.0.0)
 Requires-Dist: google-generativeai (>=0.8.2,<0.9.0)
-Requires-Dist: groq (
+Requires-Dist: groq (==0.23.1)
 Requires-Dist: jinja2 (>=3.1.2,<4.0.0)
 Requires-Dist: json-repair (>=0.28.4,<0.29.0)
 Requires-Dist: jupyter (>=1.0.0,<2.0.0)
|