valor_lite-0.37.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of valor-lite might be problematic.

Files changed (49)
  1. valor_lite/LICENSE +21 -0
  2. valor_lite/__init__.py +0 -0
  3. valor_lite/cache/__init__.py +11 -0
  4. valor_lite/cache/compute.py +154 -0
  5. valor_lite/cache/ephemeral.py +302 -0
  6. valor_lite/cache/persistent.py +529 -0
  7. valor_lite/classification/__init__.py +14 -0
  8. valor_lite/classification/annotation.py +45 -0
  9. valor_lite/classification/computation.py +378 -0
  10. valor_lite/classification/evaluator.py +879 -0
  11. valor_lite/classification/loader.py +97 -0
  12. valor_lite/classification/metric.py +535 -0
  13. valor_lite/classification/numpy_compatibility.py +13 -0
  14. valor_lite/classification/shared.py +184 -0
  15. valor_lite/classification/utilities.py +314 -0
  16. valor_lite/exceptions.py +20 -0
  17. valor_lite/object_detection/__init__.py +17 -0
  18. valor_lite/object_detection/annotation.py +238 -0
  19. valor_lite/object_detection/computation.py +841 -0
  20. valor_lite/object_detection/evaluator.py +805 -0
  21. valor_lite/object_detection/loader.py +292 -0
  22. valor_lite/object_detection/metric.py +850 -0
  23. valor_lite/object_detection/shared.py +185 -0
  24. valor_lite/object_detection/utilities.py +396 -0
  25. valor_lite/schemas.py +11 -0
  26. valor_lite/semantic_segmentation/__init__.py +15 -0
  27. valor_lite/semantic_segmentation/annotation.py +123 -0
  28. valor_lite/semantic_segmentation/computation.py +165 -0
  29. valor_lite/semantic_segmentation/evaluator.py +414 -0
  30. valor_lite/semantic_segmentation/loader.py +205 -0
  31. valor_lite/semantic_segmentation/metric.py +275 -0
  32. valor_lite/semantic_segmentation/shared.py +149 -0
  33. valor_lite/semantic_segmentation/utilities.py +88 -0
  34. valor_lite/text_generation/__init__.py +15 -0
  35. valor_lite/text_generation/annotation.py +56 -0
  36. valor_lite/text_generation/computation.py +611 -0
  37. valor_lite/text_generation/llm/__init__.py +0 -0
  38. valor_lite/text_generation/llm/exceptions.py +14 -0
  39. valor_lite/text_generation/llm/generation.py +903 -0
  40. valor_lite/text_generation/llm/instructions.py +814 -0
  41. valor_lite/text_generation/llm/integrations.py +226 -0
  42. valor_lite/text_generation/llm/utilities.py +43 -0
  43. valor_lite/text_generation/llm/validators.py +68 -0
  44. valor_lite/text_generation/manager.py +697 -0
  45. valor_lite/text_generation/metric.py +381 -0
  46. valor_lite-0.37.1.dist-info/METADATA +174 -0
  47. valor_lite-0.37.1.dist-info/RECORD +49 -0
  48. valor_lite-0.37.1.dist-info/WHEEL +5 -0
  49. valor_lite-0.37.1.dist-info/top_level.txt +1 -0
valor_lite/cache/persistent.py
@@ -0,0 +1,529 @@
+ import base64
+ import glob
+ import json
+ import os
+ from collections.abc import Iterator
+ from pathlib import Path
+ from typing import Any
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import pyarrow.dataset as ds
+ import pyarrow.parquet as pq
+
+
+ class FileCache:
+     def __init__(
+         self,
+         path: str | Path,
+         schema: pa.Schema,
+         batch_size: int,
+         rows_per_file: int,
+         compression: str,
+     ):
+         self._path = Path(path)
+         self._schema = schema
+         self._batch_size = batch_size
+         self._rows_per_file = rows_per_file
+         self._compression = compression
+
+     @property
+     def path(self) -> Path:
+         return self._path
+
+     @property
+     def schema(self) -> pa.Schema:
+         return self._schema
+
+     @property
+     def batch_size(self) -> int:
+         return self._batch_size
+
+     @property
+     def rows_per_file(self) -> int:
+         return self._rows_per_file
+
+     @property
+     def compression(self) -> str:
+         return self._compression
+
+     @staticmethod
+     def _generate_config_path(path: str | Path) -> Path:
+         """Generate cache configuration path."""
+         return Path(path) / ".cfg"
+
+     @staticmethod
+     def _encode_schema(schema: pa.Schema) -> str:
+         """Encode schema to b64 string."""
+         schema_bytes = schema.serialize()
+         return base64.b64encode(schema_bytes).decode("utf-8")
+
+     @staticmethod
+     def _decode_schema(encoded_schema: str) -> pa.Schema:
+         """Decode schema from b64 string."""
+         schema_bytes = base64.b64decode(encoded_schema)
+         return pa.ipc.read_schema(pa.BufferReader(schema_bytes))
+
+     def count_rows(self) -> int:
+         """Count the number of rows in the cache."""
+         dataset = ds.dataset(
+             source=self._path,
+             format="parquet",
+         )
+         return dataset.count_rows()
+
+     def count_tables(self) -> int:
+         """Count the number of files in the cache."""
+         return len(self.get_dataset_files())
+
+     def get_files(self) -> list[Path]:
+         """
+         Retrieve all files.
+
+         Returns
+         -------
+         list[Path]
+             A list of paths to files in the cache.
+         """
+         if not self._path.exists():
+             return []
+         files = []
+         for entry in os.listdir(self._path):
+             full_path = os.path.join(self._path, entry)
+             if os.path.isfile(full_path):
+                 files.append(Path(full_path))
+         return files
+
+     def get_dataset_files(self) -> list[Path]:
+         """
+         Retrieve all dataset files.
+
+         Returns
+         -------
+         list[Path]
+             A list of paths to dataset files in the cache.
+         """
+         if not self._path.exists():
+             return []
+         return [
+             Path(filepath) for filepath in glob.glob(f"{self._path}/*.parquet")
+         ]
+
+
+ class FileCacheReader(FileCache):
+     @classmethod
+     def load(cls, path: str | Path):
+         """
+         Load cache from disk.
+
+         Parameters
+         ----------
+         path : str | Path
+             Where the cache is stored.
+         """
+         path = Path(path)
+         if not path.exists():
+             raise FileNotFoundError(f"Directory does not exist: {path}")
+         elif not path.is_dir():
+             raise NotADirectoryError(
+                 f"Path exists but is not a directory: {path}"
+             )
+
+         def _retrieve(config: dict, key: str):
+             if value := config.get(key, None):
+                 return value
+             raise KeyError(
+                 f"'{key}' is not defined within {cls._generate_config_path(path)}"
+             )
+
+         # read configuration file
+         cfg_path = cls._generate_config_path(path)
+         with open(cfg_path, "r") as f:
+             cfg = json.load(f)
+             batch_size = _retrieve(cfg, "batch_size")
+             rows_per_file = _retrieve(cfg, "rows_per_file")
+             compression = _retrieve(cfg, "compression")
+             schema = cls._decode_schema(_retrieve(cfg, "schema"))
+
+         return cls(
+             schema=schema,
+             path=path,
+             batch_size=batch_size,
+             rows_per_file=rows_per_file,
+             compression=compression,
+         )
+
+     def iterate_tables(
+         self,
+         columns: list[str] | None = None,
+         filter: pc.Expression | None = None,
+     ) -> Iterator[pa.Table]:
+         """
+         Iterate over tables within the cache.
+
+         Parameters
+         ----------
+         columns : list[str], optional
+             Optionally select columns to be returned.
+         filter : pyarrow.compute.Expression, optional
+             Optionally filter table before returning.
+
+         Returns
+         -------
+         Iterator[pa.Table]
+         """
+         dataset = ds.dataset(
+             source=self._path,
+             schema=self._schema,
+             format="parquet",
+         )
+         for fragment in dataset.get_fragments():
+             yield fragment.to_table(columns=columns, filter=filter)
+
+     def iterate_arrays(
+         self,
+         numeric_columns: list[str] | None = None,
+         filter: pc.Expression | None = None,
+     ) -> Iterator[np.ndarray]:
+         """
+         Iterate over chunks within the cache returning arrays.
+
+         Parameters
+         ----------
+         numeric_columns : list[str], optional
+             Optionally select numeric columns to be returned within an array.
+         filter : pyarrow.compute.Expression, optional
+             Optionally filter table before returning.
+
+         Returns
+         -------
+         Iterator[np.ndarray]
+         """
+         for tbl in self.iterate_tables(
+             columns=numeric_columns,
+             filter=filter,
+         ):
+             yield np.column_stack(
+                 [tbl.column(i).to_numpy() for i in range(tbl.num_columns)]
+             )
+
+     def iterate_tables_with_arrays(
+         self,
+         columns: list[str] | None = None,
+         filter: pc.Expression | None = None,
+         numeric_columns: list[str] | None = None,
+     ) -> Iterator[tuple[pa.Table, np.ndarray]]:
+         """
+         Iterate over chunks within the cache returning both tables and arrays.
+
+         Parameters
+         ----------
+         columns : list[str], optional
+             Optionally select columns to be returned.
+         filter : pyarrow.compute.Expression, optional
+             Optionally filter table before returning.
+         numeric_columns : list[str], optional
+             Optionally select numeric columns to be returned within an array.
+
+         Returns
+         -------
+         Iterator[tuple[pa.Table, np.ndarray]]
+
+         """
+         _columns = set(columns) if columns else set()
+         _numeric_columns = set(numeric_columns) if numeric_columns else set()
+         columns = list(_columns.union(_numeric_columns))
+         for tbl in self.iterate_tables(
+             columns=columns,
+             filter=filter,
+         ):
+             table_columns = numeric_columns if numeric_columns else tbl.columns
+             yield tbl, np.column_stack(
+                 [tbl[col].to_numpy() for col in table_columns]
+             )
+
+     def iterate_fragments(
+         self, batch_size: int
+     ) -> Iterator[Iterator[pa.RecordBatch]]:
+         """
+         Iterate over fragment batch iterators within the file-based cache.
+
+         Parameters
+         ----------
+         batch_size : int
+             Maximum number of rows allowed to be read into memory per cache file.
+
+         Returns
+         -------
+         Iterator[Iterator[pa.RecordBatch]]
+         """
+         dataset = ds.dataset(
+             source=self._path,
+             schema=self._schema,
+             format="parquet",
+         )
+         for fragment in dataset.get_fragments():
+             yield fragment.to_batches(batch_size=batch_size)
+
+
+ class FileCacheWriter(FileCache):
+     def __init__(
+         self,
+         path: str | Path,
+         schema: pa.Schema,
+         batch_size: int,
+         rows_per_file: int,
+         compression: str,
+     ):
+         super().__init__(
+             path=path,
+             schema=schema,
+             batch_size=batch_size,
+             rows_per_file=rows_per_file,
+             compression=compression,
+         )
+
+         # internal state
+         self._writer = None
+         self._buffer = []
+         self._count = 0
+
+     @classmethod
+     def create(
+         cls,
+         path: str | Path,
+         schema: pa.Schema,
+         batch_size: int,
+         rows_per_file: int,
+         compression: str = "snappy",
+         delete_if_exists: bool = False,
+     ):
+         """
+         Create an on-disk cache.
+
+         Parameters
+         ----------
+         path : str | Path
+             Where to write the cache.
+         schema : pa.Schema
+             Cache schema.
+         batch_size : int
+             Target batch size when writing chunks.
+         rows_per_file : int
+             Target number of rows to store per file.
+         compression : str, default="snappy"
+             Compression method to use when storing on disk.
+         delete_if_exists : bool, default=False
+             Delete the cache if it already exists.
+         """
+         path = Path(path)
+         if delete_if_exists and path.exists():
+             cls.delete(path)
+         Path(path).mkdir(parents=True, exist_ok=False)
+
+         # write configuration file
+         cfg_path = cls._generate_config_path(path)
+         with open(cfg_path, "w") as f:
+             cfg = dict(
+                 batch_size=batch_size,
+                 rows_per_file=rows_per_file,
+                 compression=compression,
+                 schema=cls._encode_schema(schema),
+             )
+             json.dump(cfg, f, indent=2)
+
+         return cls(
+             schema=schema,
+             path=path,
+             batch_size=batch_size,
+             rows_per_file=rows_per_file,
+             compression=compression,
+         )
+
+     @classmethod
+     def delete(cls, path: str | Path):
+         """
+         Delete a cache at path.
+
+         Parameters
+         ----------
+         path : str | Path
+             Where the cache is stored.
+         """
+         path = Path(path)
+         if not path.exists():
+             return
+
+         # delete dataset files
+         reader = FileCacheReader.load(path)
+         for file in reader.get_dataset_files():
+             if file.exists() and file.is_file() and file.suffix == ".parquet":
+                 file.unlink()
+
+         # delete config file
+         cfg_path = cls._generate_config_path(path)
+         if cfg_path.exists() and cfg_path.is_file():
+             cfg_path.unlink()
+
+         # delete empty cache directory
+         path.rmdir()
+
+     def write_rows(
+         self,
+         rows: list[dict[str, Any]],
+     ):
+         """
+         Write rows to cache.
+
+         Parameters
+         ----------
+         rows : list[dict[str, Any]]
+             A list of rows represented by dictionaries mapping fields to values.
+         """
+         if not rows:
+             return
+         batch = pa.RecordBatch.from_pylist(rows, schema=self._schema)
+         self.write_batch(batch)
+
+     def write_columns(
+         self,
+         columns: dict[str, list | np.ndarray | pa.Array],
+     ):
+         """
+         Write columnar data to cache.
+
+         Parameters
+         ----------
+         columns : dict[str, list | np.ndarray | pa.Array]
+             A mapping of columnar field names to list of values.
+         """
+         if not columns:
+             return
+         batch = pa.RecordBatch.from_pydict(columns)
+         self.write_batch(batch)
+
+     def write_batch(
+         self,
+         batch: pa.RecordBatch,
+     ):
+         """
+         Write a batch to cache.
+
+         Parameters
+         ----------
+         batch : pa.RecordBatch
+             A batch of columnar data.
+         """
+         size = batch.num_rows
+         if self._buffer:
+             size += sum([b.num_rows for b in self._buffer])
+
+         # check size
+         if size < self.batch_size and self._count < self.rows_per_file:
+             self._buffer.append(batch)
+             return
+
+         if self._buffer:
+             self._buffer.append(batch)
+             batch = pa.concat_batches(self._buffer)
+             self._buffer = []
+
+         # write batch
+         writer = self._get_or_create_writer()
+         writer.write_batch(batch)
+
+         # check file size
+         self._count += size
+         if self._count >= self.rows_per_file:
+             self.flush()
+
+     def write_table(
+         self,
+         table: pa.Table,
+     ):
+         """
+         Write a table directly to cache.
+
+         Parameters
+         ----------
+         table : pa.Table
+             A populated table.
+         """
+         self.flush()
+         pq.write_table(table, where=self._generate_next_filename())
+
+     def flush(self):
+         """Flush the cache buffer."""
+         if self._buffer:
+             combined_arrays = [
+                 pa.concat_arrays([b.column(name) for b in self._buffer])
+                 for name in self._schema.names
+             ]
+             batch = pa.RecordBatch.from_arrays(
+                 combined_arrays, schema=self._schema
+             )
+             writer = self._get_or_create_writer()
+             writer.write_batch(batch)
+         self._buffer = []
+         self._count = 0
+         self._close_writer()
+
+     def sort_by(
+         self,
+         sorting: list[tuple[str, str]],
+     ):
+         """
+         Sort cache files locally and in-place.
+
+         Parameters
+         ----------
+         sorting : list[tuple[str, str]]
+             Sorting arguments in PyArrow format (e.g. [('a', 'ascending'), ('b', 'descending')]).
+         """
+         self.flush()
+         for file in self.get_dataset_files():
+             pf = pq.ParquetFile(file)
+             tbl = pf.read()
+             pf.close()
+             sorted_tbl = tbl.sort_by(sorting)
+             pq.write_table(sorted_tbl, file)
+
+     def _generate_next_filename(self) -> Path:
+         """Generates next dataset filepath."""
+         files = self.get_dataset_files()
+         if not files:
+             next_index = 0
+         else:
+             next_index = max([int(Path(f).stem) for f in files]) + 1
+         return self._path / f"{next_index:06d}.parquet"
+
+     def _get_or_create_writer(self) -> pq.ParquetWriter:
+         """Open a new parquet file for writing."""
+         if self._writer is not None:
+             return self._writer
+         self._writer = pq.ParquetWriter(
+             where=self._generate_next_filename(),
+             schema=self._schema,
+             compression=self._compression,
+         )
+         return self._writer
+
+     def _close_writer(self) -> None:
+         """Close the current parquet file."""
+         if self._writer is not None:
+             self._writer.close()
+             self._writer = None
+
+     def __enter__(self):
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit - ensures data is flushed."""
+         self.flush()
+
+     def to_reader(self) -> FileCacheReader:
+         """Get cache reader."""
+         self.flush()
+         return FileCacheReader.load(path=self.path)
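
The FileCacheWriter/FileCacheReader pair above implements a buffered, file-rotating parquet cache: write_batch accumulates record batches until batch_size rows are buffered, streams them through a ParquetWriter, and rotates to a new numbered file once rows_per_file rows have been written. A minimal usage sketch follows, assuming the classes are importable from valor_lite.cache.persistent as the file list above suggests; the schema, column names, and /tmp path are illustrative assumptions, not part of the package.

    import pyarrow as pa
    import pyarrow.compute as pc

    from valor_lite.cache.persistent import FileCacheReader, FileCacheWriter

    # hypothetical schema and location, chosen only to exercise the API shown above
    schema = pa.schema([("uid", pa.string()), ("score", pa.float64())])

    with FileCacheWriter.create(
        path="/tmp/valor_cache",   # illustrative path
        schema=schema,
        batch_size=1_000,          # rows buffered before a batch is written out
        rows_per_file=100_000,     # rows written before rotating to a new parquet file
        delete_if_exists=True,
    ) as writer:
        writer.write_rows([{"uid": "a", "score": 0.9}, {"uid": "b", "score": 0.1}])
    # __exit__ calls flush(), writing the buffered rows and closing the current file

    reader = FileCacheReader.load("/tmp/valor_cache")
    print(reader.count_rows())
    for arr in reader.iterate_arrays(
        numeric_columns=["score"],
        filter=pc.field("score") > 0.5,  # applied per fragment when reading
    ):
        print(arr.shape)

Using the writer as a context manager relies on __exit__ flushing the buffer, so the final partial batch still lands on disk before the reader loads the cache.
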
valor_lite/classification/__init__.py
@@ -0,0 +1,14 @@
+ from .annotation import Classification
+ from .evaluator import Evaluator
+ from .loader import Loader
+ from .metric import Metric, MetricType
+ from .shared import EvaluatorInfo
+
+ __all__ = [
+     "Classification",
+     "MetricType",
+     "Loader",
+     "Evaluator",
+     "Metric",
+     "EvaluatorInfo",
+ ]
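
This __init__ defines the public surface of the classification subpackage. A downstream import would look like the sketch below; only the names come from the diff, and how Loader and Evaluator are used is not shown here.

    # names re-exported by valor_lite/classification/__init__.py
    from valor_lite.classification import (
        Classification,
        Evaluator,
        EvaluatorInfo,
        Loader,
        Metric,
        MetricType,
    )
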
valor_lite/classification/annotation.py
@@ -0,0 +1,45 @@
+ from dataclasses import dataclass
+ from typing import Any
+
+
+ @dataclass
+ class Classification:
+     """
+     Classification data structure containing a ground truth label and a list of predictions.
+
+     Parameters
+     ----------
+     uid : str
+         Unique identifier for the instance.
+     groundtruth : str
+         The true label for the instance.
+     predictions : list of str
+         List of predicted labels.
+     scores : list of float
+         Confidence scores corresponding to each predicted label.
+     metadata : dict[str, Any], optional
+         A dictionary containing any metadata to be used within filtering operations.
+
+     Examples
+     --------
+     >>> classification = Classification(
+     ...     uid='123',
+     ...     groundtruth='cat',
+     ...     predictions=['cat', 'dog', 'bird'],
+     ...     scores=[0.9, 0.05, 0.05]
+     ... )
+     """
+
+     uid: str
+     groundtruth: str
+     predictions: list[str]
+     scores: list[float]
+     metadata: dict[str, Any] | None = None
+
+     def __post_init__(self):
+         if not isinstance(self.groundtruth, str):
+             raise ValueError(
+                 "A classification must contain a single groundtruth."
+             )
+         if len(self.predictions) != len(self.scores):
+             raise ValueError("There must be a score per prediction label.")