retab-0.0.40-py3-none-any.whl → retab-0.0.42-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. retab/client.py +5 -5
  2. retab/resources/consensus/completions.py +1 -1
  3. retab/resources/consensus/completions_stream.py +5 -5
  4. retab/resources/consensus/responses.py +1 -1
  5. retab/resources/consensus/responses_stream.py +2 -2
  6. retab/resources/documents/client.py +12 -11
  7. retab/resources/documents/extractions.py +4 -4
  8. retab/resources/evals.py +1 -1
  9. retab/resources/evaluations/documents.py +1 -1
  10. retab/resources/jsonlUtils.py +4 -4
  11. retab/resources/processors/automations/endpoints.py +9 -5
  12. retab/resources/processors/automations/links.py +2 -2
  13. retab/resources/processors/automations/logs.py +2 -2
  14. retab/resources/processors/automations/mailboxes.py +43 -32
  15. retab/resources/processors/automations/outlook.py +25 -7
  16. retab/resources/processors/automations/tests.py +8 -2
  17. retab/resources/processors/client.py +25 -16
  18. retab/resources/prompt_optimization.py +1 -1
  19. retab/resources/schemas.py +3 -3
  20. retab/types/automations/mailboxes.py +1 -1
  21. retab/types/completions.py +1 -1
  22. retab/types/documents/create_messages.py +4 -4
  23. retab/types/documents/extractions.py +3 -3
  24. retab/types/documents/parse.py +3 -1
  25. retab/types/evals.py +2 -2
  26. retab/types/evaluations/iterations.py +2 -2
  27. retab/types/evaluations/model.py +2 -2
  28. retab/types/extractions.py +34 -9
  29. retab/types/jobs/prompt_optimization.py +1 -1
  30. retab/types/logs.py +3 -3
  31. retab/types/schemas/object.py +4 -4
  32. retab/types/schemas/templates.py +1 -1
  33. retab/utils/__init__.py +0 -0
  34. retab/utils/_model_cards/anthropic.yaml +59 -0
  35. retab/utils/_model_cards/auto.yaml +43 -0
  36. retab/utils/_model_cards/gemini.yaml +117 -0
  37. retab/utils/_model_cards/openai.yaml +301 -0
  38. retab/utils/_model_cards/xai.yaml +28 -0
  39. retab/utils/ai_models.py +138 -0
  40. retab/utils/benchmarking.py +484 -0
  41. retab/utils/chat.py +327 -0
  42. retab/utils/display.py +440 -0
  43. retab/utils/json_schema.py +2156 -0
  44. retab/utils/mime.py +165 -0
  45. retab/utils/responses.py +169 -0
  46. retab/utils/stream_context_managers.py +52 -0
  47. retab/utils/usage/__init__.py +0 -0
  48. retab/utils/usage/usage.py +301 -0
  49. retab-0.0.42.dist-info/METADATA +119 -0
  50. {retab-0.0.40.dist-info → retab-0.0.42.dist-info}/RECORD +52 -36
  51. retab-0.0.40.dist-info/METADATA +0 -418
  52. {retab-0.0.40.dist-info → retab-0.0.42.dist-info}/WHEEL +0 -0
  53. {retab-0.0.40.dist-info → retab-0.0.42.dist-info}/top_level.txt +0 -0
retab/utils/display.py ADDED
@@ -0,0 +1,440 @@
+ import base64
+ import json
+ from io import BytesIO
+ from math import ceil
+ from pathlib import Path
+ from typing import List, Literal, Optional, TypedDict
+
+ import numpy as np
+ import requests
+ import tiktoken  # For text tokenization
+ from PIL import Image
+ from rich.console import Console
+ from rich.table import Table
+
+
+ class TokenStats(TypedDict):
+     min: float
+     max: float
+     mean: float
+     median: float
+     p5: float
+     p95: float
+
+
+ class TokenCounts(TypedDict):
+     input_text_tokens: int
+     output_text_tokens: int
+     input_image_tokens: int
+     output_image_tokens: int
+
+
+ class MetricCategory(TypedDict):
+     num_examples: int
+     total_tokens: TokenStats
+     input_tokens: TokenStats
+     output_tokens: TokenStats
+     sum_total_tokens: float
+     sum_input_tokens: float
+     sum_output_tokens: float
+     num_examples_over_token_limit: int
+
+
+ class Metrics(TypedDict):
+     Text: MetricCategory
+     Image: MetricCategory
+     Total: MetricCategory
+
+
+ def count_text_tokens(content: str, encoding_name: str = "cl100k_base") -> int:
+     """
+     Count the number of tokens in a given text content using the specified encoding.
+     """
+     enc = tiktoken.get_encoding(encoding_name)
+     return len(enc.encode(content))
+
+
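A quick sanity check of count_text_tokens (an editorial sketch, not part of the package diff; assumes tiktoken is installed):

    from retab.utils.display import count_text_tokens

    # "Hello, world!" encodes to a handful of cl100k_base tokens (typically 4)
    print(count_text_tokens("Hello, world!"))
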
+ def count_image_tokens(image_url: str, detail: Literal["low", "high", "auto"] = "high") -> int:
+     base_token_cost = 85  # cost for all images
+     token_per_tile = 170  # cost per 512×512 tile in high detail
+
+     # 1. Decide detail=low or detail=high
+     # If detail=auto, figure out from user input or some heuristic
+     if detail == "low":
+         # 2. Low detail => always 85 tokens
+         return base_token_cost
+     else:
+         assert detail == "high" or detail == "auto"
+         # 3. High detail => 2-step scaling + tile-based cost
+
+         # (a) Get the raw image dimensions
+         try:
+             if image_url.startswith("data:image"):
+                 header, encoded_data = image_url.split(",", 1)
+                 image_data = base64.b64decode(encoded_data)
+                 img = Image.open(BytesIO(image_data))
+             else:
+                 # HTTP URL or local path
+                 response = requests.get(image_url, timeout=5)
+                 response.raise_for_status()
+                 img = Image.open(BytesIO(response.content))
+
+             width, height = img.size
+         except Exception:
+             # If we fail to decode or fetch, maybe return the base cost
+             # plus one tile as a fallback
+             return base_token_cost + token_per_tile
+
+         # (b) Scale so neither dimension exceeds 2048
+         max_side = max(width, height)
+         if max_side > 2048:
+             scale_factor = 2048.0 / max_side
+             width = int(width * scale_factor)
+             height = int(height * scale_factor)
+
+         # (c) Upscale if shortest side < 768
+         min_side = min(width, height)
+         if min_side < 768:
+             upscale_factor = 768.0 / min_side
+             width = int(width * upscale_factor)
+             height = int(height * upscale_factor)
+
+         # (d) Count 512×512 tiles in the scaled image
+         tiles_wide = ceil(width / 512)
+         tiles_high = ceil(height / 512)
+         total_tiles = tiles_wide * tiles_high
+
+         return base_token_cost + (token_per_tile * total_tiles)
+
+
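To make the tile arithmetic concrete: a 1024×1024 image at detail="high" needs no downscale (max side ≤ 2048) and no upscale (min side ≥ 768), so it spans ceil(1024/512) × ceil(1024/512) = 4 tiles, giving 85 + 170 × 4 = 765 tokens. A minimal check of that arithmetic (editorial sketch, not part of the diff):

    from math import ceil

    width, height = 1024, 1024
    tiles = ceil(width / 512) * ceil(height / 512)  # 2 x 2 = 4 tiles
    assert 85 + 170 * tiles == 765                  # detail="high" cost
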
+ def process_jsonl_file(jsonl_path: str) -> List[TokenCounts]:
+     """
+     Process a JSONL file and calculate the text and image tokens for each example.
+     Returns a list of dictionaries with input/output token counts for text and images.
+     """
+     results = []
+
+     with open(jsonl_path, "r", encoding="utf-8") as file:
+         for line in file:
+             example = json.loads(line)
+             input_text_tokens = 0
+             output_text_tokens = 0
+             input_image_tokens = 0
+             output_image_tokens = 0
+
+             for message in example.get("messages", []):
+                 role = message.get("role")
+                 content = message.get("content")
+
+                 if isinstance(content, str):
+                     # Count text tokens based on role
+                     if role in ["developer", "system", "user"]:
+                         input_text_tokens += count_text_tokens(content)
+                     elif role == "assistant":
+                         output_text_tokens += count_text_tokens(content)
+
+                 elif isinstance(content, list):  # Check for images in content
+                     for item in content:
+                         if item.get("type") == "image_url" and "image_url" in item:
+                             image_url = item["image_url"]["url"]
+                             tokens = count_image_tokens(image_url)
+                             if role in ["developer", "system", "user"]:
+                                 input_image_tokens += tokens
+                             elif role == "assistant":
+                                 output_image_tokens += tokens
+
+             results.append(
+                 TokenCounts(
+                     input_text_tokens=input_text_tokens,
+                     output_text_tokens=output_text_tokens,
+                     input_image_tokens=input_image_tokens,
+                     output_image_tokens=output_image_tokens,
+                 )
+             )
+
+     return results
+
+
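Both parsers above expect OpenAI chat-format examples, one JSON object per line. An illustrative line they would accept (editorial example, wrapped here for readability; in the file it must be a single line, and the URL is a placeholder):

    {"messages": [
        {"role": "system", "content": "Extract the invoice total."},
        {"role": "user", "content": [
            {"type": "text", "text": "See the attached scan."},
            {"type": "image_url", "image_url": {"url": "https://example.com/invoice.png", "detail": "high"}}
        ]},
        {"role": "assistant", "content": "{\"total\": 42.50}"}
    ]}
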
+ def calculate_statistics(data: List[int]) -> TokenStats:
+     """
+     Calculate statistics for a list of numbers.
+     """
+     if not data:
+         return {"min": 0, "max": 0, "mean": 0, "median": 0, "p5": 0, "p95": 0}
+
+     return {
+         "min": float(min(data)),
+         "max": float(max(data)),
+         "mean": float(np.mean(data)),
+         "median": float(np.median(data)),
+         "p5": float(np.percentile(data, 5)),
+         "p95": float(np.percentile(data, 95)),
+     }
+
+
+ def process_dataset_and_compute_metrics(jsonl_path: Path | str, token_limit: int = 128000) -> Metrics:
+     """
+     Process the dataset to compute metrics for Text, Image, and Total tokens.
+     """
+     # Initialize metrics
+     metrics: Metrics = {
+         "Text": MetricCategory(
+             num_examples=0,
+             total_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             input_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             output_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             sum_total_tokens=0,
+             sum_input_tokens=0,
+             sum_output_tokens=0,
+             num_examples_over_token_limit=0,
+         ),
+         "Image": MetricCategory(
+             num_examples=0,
+             total_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             input_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             output_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             sum_total_tokens=0,
+             sum_input_tokens=0,
+             sum_output_tokens=0,
+             num_examples_over_token_limit=0,
+         ),
+         "Total": MetricCategory(
+             num_examples=0,
+             total_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             input_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             output_tokens=TokenStats(min=0, max=0, mean=0, median=0, p5=0, p95=0),
+             sum_total_tokens=0,
+             sum_input_tokens=0,
+             sum_output_tokens=0,
+             num_examples_over_token_limit=0,
+         ),
+     }
+
+     # Accumulate token counts
+     input_text_tokens = []
+     output_text_tokens = []
+     messages_text_tokens = []
+
+     input_image_tokens = []
+     output_image_tokens = []
+     messages_image_tokens = []
+
+     input_total_tokens = []
+     output_total_tokens = []
+     messages_total_tokens = []
+
+     with open(jsonl_path, "r", encoding="utf-8") as file:
+         for line in file:
+             example = json.loads(line)
+
+             input_text_tokens_example = 0
+             output_text_tokens_example = 0
+
+             input_image_tokens_example = 0
+             output_image_tokens_example = 0
+
+             for message in example.get("messages", []):
+                 role = message.get("role")
+                 content = message.get("content")
+
+                 if isinstance(content, str):
+                     if role in ["developer", "system", "user"]:
+                         input_text_tokens_example += count_text_tokens(content)
+                     elif role == "assistant":
+                         output_text_tokens_example += count_text_tokens(content)
+                 elif isinstance(content, list):  # Handle images
+                     for item in content:
+                         if item.get("type") == "image_url" and "image_url" in item:
+                             image_url = item["image_url"]["url"]
+                             detail = item["image_url"].get("detail", "high")  # "detail" may be omitted
+                             tokens = count_image_tokens(image_url, detail)
+                             if role in ["developer", "system", "user"]:
+                                 input_image_tokens_example += tokens
+                             elif role == "assistant":
+                                 output_image_tokens_example += tokens
+
+                         elif item.get("type") == "text":
+                             if role in ["developer", "system", "user"]:
+                                 input_text_tokens_example += count_text_tokens(item["text"])
+                             elif role == "assistant":
+                                 output_text_tokens_example += count_text_tokens(item["text"])
+
+             # Calculate totals for the example
+             example_total_tokens = input_text_tokens_example + output_text_tokens_example + input_image_tokens_example + output_image_tokens_example
+
+             # Add to accumulators
+             input_text_tokens.append(input_text_tokens_example)
+             output_text_tokens.append(output_text_tokens_example)
+             messages_text_tokens.append(input_text_tokens_example + output_text_tokens_example)
+
+             input_image_tokens.append(input_image_tokens_example)
+             output_image_tokens.append(output_image_tokens_example)
+             messages_image_tokens.append(input_image_tokens_example + output_image_tokens_example)
+
+             input_total_tokens.append(input_text_tokens_example + input_image_tokens_example)
+             output_total_tokens.append(output_text_tokens_example + output_image_tokens_example)
+             messages_total_tokens.append(example_total_tokens)
+
+             # Count examples over token limit
+             if input_text_tokens_example > token_limit:
+                 metrics["Text"]["num_examples_over_token_limit"] += 1
+             if input_image_tokens_example > token_limit:
+                 metrics["Image"]["num_examples_over_token_limit"] += 1
+             if example_total_tokens > token_limit:
+                 metrics["Total"]["num_examples_over_token_limit"] += 1
+
+     # Update metrics for Text, Image, and Total
+     metrics["Text"]["num_examples"] = len(input_text_tokens)
+     metrics["Text"]["total_tokens"] = calculate_statistics(messages_text_tokens)
+     metrics["Text"]["input_tokens"] = calculate_statistics(input_text_tokens)
+     metrics["Text"]["output_tokens"] = calculate_statistics(output_text_tokens)
+     metrics["Text"]["sum_input_tokens"] = sum(input_text_tokens)
+     metrics["Text"]["sum_output_tokens"] = sum(output_text_tokens)
+     metrics["Text"]["sum_total_tokens"] = sum(messages_text_tokens)
+
+     metrics["Image"]["num_examples"] = len(input_image_tokens)
+     metrics["Image"]["total_tokens"] = calculate_statistics(messages_image_tokens)
+     metrics["Image"]["input_tokens"] = calculate_statistics(input_image_tokens)
+     metrics["Image"]["output_tokens"] = calculate_statistics(output_image_tokens)
+     metrics["Image"]["sum_input_tokens"] = sum(input_image_tokens)
+     metrics["Image"]["sum_output_tokens"] = sum(output_image_tokens)
+     metrics["Image"]["sum_total_tokens"] = sum(messages_image_tokens)
+
+     metrics["Total"]["num_examples"] = len(input_total_tokens)
+     metrics["Total"]["total_tokens"] = calculate_statistics(messages_total_tokens)
+     metrics["Total"]["input_tokens"] = calculate_statistics(input_total_tokens)
+     metrics["Total"]["output_tokens"] = calculate_statistics(output_total_tokens)
+     metrics["Total"]["sum_input_tokens"] = sum(input_total_tokens)
+     metrics["Total"]["sum_output_tokens"] = sum(output_total_tokens)
+     metrics["Total"]["sum_total_tokens"] = sum(messages_total_tokens)
+
+     return metrics
+
+
+ def display_metrics(metrics: Metrics, input_token_price: Optional[float] = None, output_token_price: Optional[float] = None) -> None:
+     """
+     Display the metrics dictionary in a compact table with min/max, mean/median, and p5/p95 on the same row.
+     """
+     console = Console(style="on grey23")
+     table = Table(title="Dataset Metrics", show_lines=True)
+
+     # Add columns
+     table.add_column("Metric", justify="left", style="#BDE8F6", no_wrap=True)
+     table.add_column("Text", justify="right", style="#C2BDF6")
+     table.add_column("Image", justify="right", style="#F6BDBD")
+     table.add_column("Total", justify="right", style="#F6E4BD")
+
+     # Add rows
+     table.add_row("Num Examples", str(metrics["Text"]["num_examples"]), str(metrics["Image"]["num_examples"]), str(metrics["Total"]["num_examples"]))
+
+     table.add_row(
+         "Examples Over Limit",
+         str(metrics["Text"]["num_examples_over_token_limit"]),
+         str(metrics["Image"]["num_examples_over_token_limit"]),
+         str(metrics["Total"]["num_examples_over_token_limit"]),
+     )
+
+     table.add_row("")  # Empty row for spacing
+
+     # Rows for input tokens
+     table.add_row(
+         "Min / Max Input Tokens",
+         f"{metrics['Text']['input_tokens']['min']:.0f} / {metrics['Text']['input_tokens']['max']:.0f}",
+         f"{metrics['Image']['input_tokens']['min']:.0f} / {metrics['Image']['input_tokens']['max']:.0f}",
+         f"{metrics['Total']['input_tokens']['min']:.0f} / {metrics['Total']['input_tokens']['max']:.0f}",
+     )
+
+     table.add_row(
+         "Mean / Median Input Tokens",
+         f"{metrics['Text']['input_tokens']['mean']:.0f} / {metrics['Text']['input_tokens']['median']:.0f}",
+         f"{metrics['Image']['input_tokens']['mean']:.0f} / {metrics['Image']['input_tokens']['median']:.0f}",
+         f"{metrics['Total']['input_tokens']['mean']:.0f} / {metrics['Total']['input_tokens']['median']:.0f}",
+     )
+
+     table.add_row(
+         "P5 / P95 Input Tokens",
+         f"{metrics['Text']['input_tokens']['p5']:.0f} / {metrics['Text']['input_tokens']['p95']:.0f}",
+         f"{metrics['Image']['input_tokens']['p5']:.0f} / {metrics['Image']['input_tokens']['p95']:.0f}",
+         f"{metrics['Total']['input_tokens']['p5']:.0f} / {metrics['Total']['input_tokens']['p95']:.0f}",
+     )
+
+     table.add_row("Sum Input Tokens", f"{metrics['Text']['sum_input_tokens']}", f"{metrics['Image']['sum_input_tokens']}", f"{metrics['Total']['sum_input_tokens']}")
+
+     table.add_row("")  # Empty row for spacing
+
+     # Rows for output tokens
+     table.add_row(
+         "Min / Max Output Tokens",
+         f"{metrics['Text']['output_tokens']['min']:.0f} / {metrics['Text']['output_tokens']['max']:.0f}",
+         f"{metrics['Image']['output_tokens']['min']:.0f} / {metrics['Image']['output_tokens']['max']:.0f}",
+         f"{metrics['Total']['output_tokens']['min']:.0f} / {metrics['Total']['output_tokens']['max']:.0f}",
+     )
+
+     table.add_row(
+         "Mean / Median Output Tokens",
+         f"{metrics['Text']['output_tokens']['mean']:.0f} / {metrics['Text']['output_tokens']['median']:.0f}",
+         f"{metrics['Image']['output_tokens']['mean']:.0f} / {metrics['Image']['output_tokens']['median']:.0f}",
+         f"{metrics['Total']['output_tokens']['mean']:.0f} / {metrics['Total']['output_tokens']['median']:.0f}",
+     )
+
+     table.add_row(
+         "P5 / P95 Output Tokens",
+         f"{metrics['Text']['output_tokens']['p5']:.0f} / {metrics['Text']['output_tokens']['p95']:.0f}",
+         f"{metrics['Image']['output_tokens']['p5']:.0f} / {metrics['Image']['output_tokens']['p95']:.0f}",
+         f"{metrics['Total']['output_tokens']['p5']:.0f} / {metrics['Total']['output_tokens']['p95']:.0f}",
+     )
+
+     table.add_row("Sum Output Tokens", f"{metrics['Text']['sum_output_tokens']}", f"{metrics['Image']['sum_output_tokens']}", f"{metrics['Total']['sum_output_tokens']}")
+
+     table.add_row("")  # Empty row for spacing
+
+     # Rows for total tokens (input + output)
+     table.add_row(
+         "Min / Max Tokens",
+         f"{metrics['Text']['total_tokens']['min']:.0f} / {metrics['Text']['total_tokens']['max']:.0f}",
+         f"{metrics['Image']['total_tokens']['min']:.0f} / {metrics['Image']['total_tokens']['max']:.0f}",
+         f"{metrics['Total']['total_tokens']['min']:.0f} / {metrics['Total']['total_tokens']['max']:.0f}",
+     )
+
+     table.add_row(
+         "Mean / Median Tokens",
+         f"{metrics['Text']['total_tokens']['mean']:.0f} / {metrics['Text']['total_tokens']['median']:.0f}",
+         f"{metrics['Image']['total_tokens']['mean']:.0f} / {metrics['Image']['total_tokens']['median']:.0f}",
+         f"{metrics['Total']['total_tokens']['mean']:.0f} / {metrics['Total']['total_tokens']['median']:.0f}",
+     )
+
+     table.add_row(
+         "P5 / P95 Tokens",
+         f"{metrics['Text']['total_tokens']['p5']:.0f} / {metrics['Text']['total_tokens']['p95']:.0f}",
+         f"{metrics['Image']['total_tokens']['p5']:.0f} / {metrics['Image']['total_tokens']['p95']:.0f}",
+         f"{metrics['Total']['total_tokens']['p5']:.0f} / {metrics['Total']['total_tokens']['p95']:.0f}",
+     )
+
+     table.add_row("Sum Total Tokens", f"{metrics['Text']['sum_total_tokens']}", f"{metrics['Image']['sum_total_tokens']}", f"{metrics['Total']['sum_total_tokens']}")
+
+     table.add_row("")  # Empty row for spacing
+
+     if input_token_price is not None:
+         table.add_row(
+             "Input Cost",
+             f"{metrics['Text']['sum_input_tokens'] * input_token_price:.2f} USD",
+             f"{metrics['Image']['sum_input_tokens'] * input_token_price:.2f} USD",
+             f"{metrics['Total']['sum_input_tokens'] * input_token_price:.2f} USD",
+         )
+
+     if output_token_price is not None:
+         table.add_row(
+             "Output Cost",
+             f"{metrics['Text']['sum_output_tokens'] * output_token_price:.2f} USD",
+             f"{metrics['Image']['sum_output_tokens'] * output_token_price:.2f} USD",
+             f"{metrics['Total']['sum_output_tokens'] * output_token_price:.2f} USD",
+         )
+
+     if input_token_price is not None and output_token_price is not None:
+         # Total cost = input cost + output cost, each priced at its own rate
+         table.add_row(
+             "Total Cost",
+             f"{metrics['Text']['sum_input_tokens'] * input_token_price + metrics['Text']['sum_output_tokens'] * output_token_price:.2f} USD",
+             f"{metrics['Image']['sum_input_tokens'] * input_token_price + metrics['Image']['sum_output_tokens'] * output_token_price:.2f} USD",
+             f"{metrics['Total']['sum_input_tokens'] * input_token_price + metrics['Total']['sum_output_tokens'] * output_token_price:.2f} USD",
+         )
+
+     # Print the table
+     console.print(table)
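
Taken together, a minimal end-to-end sketch of the new module (editorial; the dataset path is assumed, and the prices are illustrative per-token USD rates):

    from retab.utils.display import display_metrics, process_dataset_and_compute_metrics

    metrics = process_dataset_and_compute_metrics("dataset.jsonl", token_limit=128000)
    display_metrics(
        metrics,
        input_token_price=2.5e-6,  # assumed: $2.50 per 1M input tokens
        output_token_price=1e-5,   # assumed: $10.00 per 1M output tokens
    )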