valor-lite 0.36.3.tar.gz → 0.37.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {valor_lite-0.36.3 → valor_lite-0.37.1}/PKG-INFO +2 -1
  2. {valor_lite-0.36.3 → valor_lite-0.37.1}/pyproject.toml +1 -0
  3. valor_lite-0.37.1/valor_lite/cache/__init__.py +11 -0
  4. valor_lite-0.37.1/valor_lite/cache/compute.py +154 -0
  5. valor_lite-0.37.1/valor_lite/cache/ephemeral.py +302 -0
  6. valor_lite-0.37.1/valor_lite/cache/persistent.py +529 -0
  7. valor_lite-0.37.1/valor_lite/classification/__init__.py +14 -0
  8. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/classification/annotation.py +4 -0
  9. valor_lite-0.37.1/valor_lite/classification/computation.py +378 -0
  10. valor_lite-0.37.1/valor_lite/classification/evaluator.py +879 -0
  11. valor_lite-0.37.1/valor_lite/classification/loader.py +97 -0
  12. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/classification/metric.py +141 -4
  13. valor_lite-0.37.1/valor_lite/classification/shared.py +184 -0
  14. valor_lite-0.37.1/valor_lite/classification/utilities.py +314 -0
  15. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/exceptions.py +5 -0
  16. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/object_detection/__init__.py +5 -4
  17. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/object_detection/annotation.py +13 -1
  18. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/object_detection/computation.py +309 -292
  19. valor_lite-0.37.1/valor_lite/object_detection/evaluator.py +805 -0
  20. valor_lite-0.37.1/valor_lite/object_detection/loader.py +292 -0
  21. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/object_detection/metric.py +152 -3
  22. valor_lite-0.37.1/valor_lite/object_detection/shared.py +185 -0
  23. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/object_detection/utilities.py +182 -109
  24. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/semantic_segmentation/__init__.py +5 -4
  25. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/semantic_segmentation/annotation.py +35 -20
  26. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/semantic_segmentation/computation.py +20 -110
  27. valor_lite-0.37.1/valor_lite/semantic_segmentation/evaluator.py +414 -0
  28. valor_lite-0.37.1/valor_lite/semantic_segmentation/loader.py +205 -0
  29. valor_lite-0.37.1/valor_lite/semantic_segmentation/shared.py +149 -0
  30. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/semantic_segmentation/utilities.py +6 -23
  31. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite.egg-info/PKG-INFO +2 -1
  32. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite.egg-info/SOURCES.txt +13 -5
  33. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite.egg-info/requires.txt +1 -0
  34. valor_lite-0.36.3/valor_lite/classification/__init__.py +0 -19
  35. valor_lite-0.36.3/valor_lite/classification/computation.py +0 -396
  36. valor_lite-0.36.3/valor_lite/classification/manager.py +0 -523
  37. valor_lite-0.36.3/valor_lite/classification/utilities.py +0 -211
  38. valor_lite-0.36.3/valor_lite/object_detection/manager.py +0 -801
  39. valor_lite-0.36.3/valor_lite/profiling.py +0 -374
  40. valor_lite-0.36.3/valor_lite/semantic_segmentation/benchmark.py +0 -237
  41. valor_lite-0.36.3/valor_lite/semantic_segmentation/manager.py +0 -423
  42. {valor_lite-0.36.3 → valor_lite-0.37.1}/README.md +0 -0
  43. {valor_lite-0.36.3 → valor_lite-0.37.1}/setup.cfg +0 -0
  44. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/LICENSE +0 -0
  45. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/__init__.py +0 -0
  46. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/classification/numpy_compatibility.py +0 -0
  47. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/schemas.py +0 -0
  48. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/semantic_segmentation/metric.py +0 -0
  49. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/__init__.py +0 -0
  50. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/annotation.py +0 -0
  51. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/computation.py +0 -0
  52. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/__init__.py +0 -0
  53. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/exceptions.py +0 -0
  54. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/generation.py +0 -0
  55. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/instructions.py +0 -0
  56. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/integrations.py +0 -0
  57. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/utilities.py +0 -0
  58. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/llm/validators.py +0 -0
  59. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/manager.py +0 -0
  60. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite/text_generation/metric.py +0 -0
  61. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite.egg-info/dependency_links.txt +0 -0
  62. {valor_lite-0.36.3 → valor_lite-0.37.1}/valor_lite.egg-info/top_level.txt +0 -0
{valor_lite-0.36.3 → valor_lite-0.37.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: valor-lite
-Version: 0.36.3
+Version: 0.37.1
 Summary: Evaluate machine learning models.
 Project-URL: homepage, https://www.striveworks.com
 Requires-Python: >=3.10
@@ -8,6 +8,7 @@ Description-Content-Type: text/markdown
 Requires-Dist: numpy
 Requires-Dist: tqdm
 Requires-Dist: shapely
+Requires-Dist: pyarrow
 Provides-Extra: nlp
 Requires-Dist: evaluate; extra == "nlp"
 Requires-Dist: nltk; extra == "nlp"
{valor_lite-0.36.3 → valor_lite-0.37.1}/pyproject.toml
@@ -9,6 +9,7 @@ dependencies = [
     "numpy",
     "tqdm",
     "shapely",
+    "pyarrow",
 ]
 
 [project.urls]
valor_lite-0.37.1/valor_lite/cache/__init__.py
@@ -0,0 +1,11 @@
+from .compute import sort
+from .ephemeral import MemoryCacheReader, MemoryCacheWriter
+from .persistent import FileCacheReader, FileCacheWriter
+
+__all__ = [
+    "FileCacheReader",
+    "FileCacheWriter",
+    "MemoryCacheReader",
+    "MemoryCacheWriter",
+    "sort",
+]
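
The new subpackage pairs mirrored in-memory (ephemeral.py) and file-backed (persistent.py) cache implementations with a sort helper. A minimal sketch of the public import surface, taken directly from the __all__ above:

from valor_lite.cache import (
    FileCacheReader,
    FileCacheWriter,
    MemoryCacheReader,
    MemoryCacheWriter,
    sort,
)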
valor_lite-0.37.1/valor_lite/cache/compute.py
@@ -0,0 +1,154 @@
+import heapq
+import tempfile
+from pathlib import Path
+from typing import Callable
+
+import pyarrow as pa
+
+from valor_lite.cache.ephemeral import MemoryCacheReader, MemoryCacheWriter
+from valor_lite.cache.persistent import FileCacheReader, FileCacheWriter
+
+
+def _merge(
+    source: MemoryCacheReader | FileCacheReader,
+    sink: MemoryCacheWriter | FileCacheWriter,
+    intermediate_sink: MemoryCacheWriter | FileCacheWriter,
+    batch_size: int,
+    sorting: list[tuple[str, str]],
+    columns: list[str] | None = None,
+    table_sort_override: Callable[[pa.Table], pa.Table] | None = None,
+):
+    """Merge locally sorted cache fragments."""
+    for tbl in source.iterate_tables(columns=columns):
+        if table_sort_override is not None:
+            sorted_tbl = table_sort_override(tbl)
+        else:
+            sorted_tbl = tbl.sort_by(sorting)
+        intermediate_sink.write_table(sorted_tbl)
+    intermediate_source = intermediate_sink.to_reader()
+
+    # define merge key
+    def create_sort_key(
+        batches: list[pa.RecordBatch],
+        batch_idx: int,
+        row_idx: int,
+    ):
+        args = [
+            -batches[batch_idx][name][row_idx].as_py()
+            if direction == "descending"
+            else batches[batch_idx][name][row_idx].as_py()
+            for name, direction in sorting
+        ]
+        return (
+            *args,
+            batch_idx,
+            row_idx,
+        )
+
+    # merge sorted rows
+    heap = []
+    batch_iterators = []
+    batches = []
+    for batch_idx, batch_iter in enumerate(
+        intermediate_source.iterate_fragments(batch_size=batch_size)
+    ):
+        batch_iterators.append(batch_iter)
+        batches.append(next(batch_iterators[batch_idx], None))
+        if batches[batch_idx] is not None and len(batches[batch_idx]) > 0:
+            heap.append(create_sort_key(batches, batch_idx, 0))
+    heapq.heapify(heap)
+
+    while heap:
+        row = heapq.heappop(heap)
+        batch_idx = row[-2]
+        row_idx = row[-1]
+        row_table = batches[batch_idx].slice(row_idx, 1)
+        sink.write_batch(row_table)
+        row_idx += 1
+        if row_idx < len(batches[batch_idx]):
+            heapq.heappush(
+                heap,
+                create_sort_key(batches, batch_idx, row_idx),
+            )
+        else:
+            batches[batch_idx] = next(batch_iterators[batch_idx], None)
+            if batches[batch_idx] is not None and len(batches[batch_idx]) > 0:
+                heapq.heappush(
+                    heap,
+                    create_sort_key(batches, batch_idx, 0),
+                )
+
+    sink.flush()
+
+
+def sort(
+    source: MemoryCacheReader | FileCacheReader,
+    sink: MemoryCacheWriter | FileCacheWriter,
+    batch_size: int,
+    sorting: list[tuple[str, str]],
+    columns: list[str] | None = None,
+    table_sort_override: Callable[[pa.Table], pa.Table] | None = None,
+):
+    """
+    Sort data into a new cache.
+
+    Parameters
+    ----------
+    source : MemoryCacheReader | FileCacheReader
+        A read-only cache. If file-based, each file must be locally sorted.
+    sink : MemoryCacheWriter | FileCacheWriter
+        The cache where sorted data will be written.
+    batch_size : int
+        Maximum number of rows allowed to be read into memory per cache file.
+    sorting : list[tuple[str, str]]
+        Sorting arguments in PyArrow format (e.g. [('a', 'ascending'), ('b', 'descending')]).
+        Note that only numeric fields are supported.
+    columns : list[str], optional
+        Option to only read a subset of columns.
+    table_sort_override : Callable[[pa.Table], pa.Table], optional
+        Option to override the sort function for singular cache fragments.
+    """
+
+    if source.count_tables() == 1:
+        for tbl in source.iterate_tables(columns=columns):
+            if table_sort_override is not None:
+                sorted_tbl = table_sort_override(tbl)
+            else:
+                sorted_tbl = tbl.sort_by(sorting)
+            sink.write_table(sorted_tbl)
+        sink.flush()
+        return
+
+    if isinstance(sink, FileCacheWriter):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            intermediate_sink = FileCacheWriter.create(
+                path=Path(tmpdir) / "sorting_intermediate",
+                schema=sink.schema,
+                batch_size=sink.batch_size,
+                rows_per_file=sink.rows_per_file,
+                compression=sink.compression,
+                delete_if_exists=False,
+            )
+            _merge(
+                source=source,
+                sink=sink,
+                intermediate_sink=intermediate_sink,
+                batch_size=batch_size,
+                sorting=sorting,
+                columns=columns,
+                table_sort_override=table_sort_override,
+            )
+    else:
+        intermediate_sink = MemoryCacheWriter.create(
+            schema=sink.schema,
+            batch_size=sink.batch_size,
+        )
+        _merge(
+            source=source,
+            sink=sink,
+            intermediate_sink=intermediate_sink,
+            batch_size=batch_size,
+            sorting=sorting,
+            columns=columns,
+            table_sort_override=table_sort_override,
+        )
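
Structurally, sort is a two-phase external sort: each fragment is sorted locally into an intermediate cache, then _merge performs a heap-based k-way merge keyed on the numeric sort columns (negated for descending order). A minimal usage sketch with the in-memory classes from ephemeral.py below; the schema, column names, and values are invented for illustration:

import pyarrow as pa

from valor_lite.cache import MemoryCacheWriter, sort

# hypothetical two-column numeric schema
schema = pa.schema([("score", pa.float64()), ("label", pa.int64())])

# stage unsorted rows in an in-memory cache
writer = MemoryCacheWriter.create(schema=schema, batch_size=1024)
writer.write_rows(
    [
        {"score": 0.2, "label": 1},
        {"score": 0.9, "label": 0},
        {"score": 0.5, "label": 1},
    ]
)

# sort into a fresh cache, highest score first; a single-table source
# takes the fast path above and is sorted wholesale via Table.sort_by
sink = MemoryCacheWriter.create(schema=schema, batch_size=1024)
sort(
    source=writer.to_reader(),
    sink=sink,
    batch_size=1024,
    sorting=[("score", "descending")],
)

for tbl in sink.to_reader().iterate_tables():
    print(tbl.to_pydict())  # scores in descending order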
valor_lite-0.37.1/valor_lite/cache/ephemeral.py
@@ -0,0 +1,302 @@
+from collections.abc import Iterator
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+
+
+class MemoryCache:
+    def __init__(
+        self,
+        table: pa.Table,
+        batch_size: int,
+    ):
+        self._table = table
+        self._batch_size = batch_size
+
+    @property
+    def schema(self) -> pa.Schema:
+        return self._table.schema
+
+    @property
+    def batch_size(self) -> int:
+        return self._batch_size
+
+    def count_tables(self) -> int:
+        """Count the number of tables in the cache."""
+        return 1
+
+    def count_rows(self) -> int:
+        """Count the number of rows in the cache."""
+        return self._table.num_rows
+
+
+class MemoryCacheReader(MemoryCache):
+    def iterate_tables(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+    ) -> Iterator[pa.Table]:
+        """
+        Iterate over tables within the cache.
+
+        Parameters
+        ----------
+        columns : list[str], optional
+            Optionally select columns to be returned.
+        filter : pyarrow.compute.Expression, optional
+            Optionally filter the table before returning.
+
+        Returns
+        -------
+        Iterator[pa.Table]
+        """
+        table = self._table
+        if filter is not None:
+            table = table.filter(filter)
+        if columns is not None:
+            table = table.select(columns)
+        yield table
+
+    def iterate_arrays(
+        self,
+        numeric_columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+    ) -> Iterator[np.ndarray]:
+        """
+        Iterate over chunks within the cache, returning arrays.
+
+        Parameters
+        ----------
+        numeric_columns : list[str], optional
+            Optionally select numeric columns to be returned within an array.
+        filter : pyarrow.compute.Expression, optional
+            Optionally filter the table before returning.
+
+        Returns
+        -------
+        Iterator[np.ndarray]
+        """
+        for tbl in self.iterate_tables(columns=numeric_columns, filter=filter):
+            yield np.column_stack(
+                [tbl.column(i).to_numpy() for i in range(tbl.num_columns)]
+            )
+
+    def iterate_tables_with_arrays(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        numeric_columns: list[str] | None = None,
+    ) -> Iterator[tuple[pa.Table, np.ndarray]]:
+        """
+        Iterate over chunks within the cache, returning both tables and arrays.
+
+        Parameters
+        ----------
+        columns : list[str], optional
+            Optionally select columns to be returned.
+        filter : pyarrow.compute.Expression, optional
+            Optionally filter the table before returning.
+        numeric_columns : list[str], optional
+            Optionally select numeric columns to be returned within an array.
+
+        Returns
+        -------
+        Iterator[tuple[pa.Table, np.ndarray]]
+        """
+        _columns = set(columns) if columns else set()
+        _numeric_columns = set(numeric_columns) if numeric_columns else set()
+        columns = list(_columns.union(_numeric_columns))
+        for tbl in self.iterate_tables(
+            columns=columns,
+            filter=filter,
+        ):
+            table_columns = numeric_columns if numeric_columns else tbl.columns
+            yield tbl, np.column_stack(
+                [tbl[col].to_numpy() for col in table_columns]
+            )
+
+    def iterate_fragments(
+        self, batch_size: int
+    ) -> Iterator[Iterator[pa.RecordBatch]]:
+        """
+        Yield a table batch iterator.
+
+        This is intended to emulate file-based access patterns.
+
+        Parameters
+        ----------
+        batch_size : int
+            Maximum number of rows allowed to be read per batch.
+
+        Yields
+        ------
+        Iterator[Iterator[pa.RecordBatch]]
+        """
+        yield iter(self._table.to_batches(max_chunksize=batch_size))
+
+
+class MemoryCacheWriter(MemoryCache):
+    def __init__(
+        self,
+        table: pa.Table,
+        batch_size: int,
+    ):
+        super().__init__(
+            table=table,
+            batch_size=batch_size,
+        )
+
+        # internal state
+        self._buffer = []
+
+    @classmethod
+    def create(
+        cls,
+        schema: pa.Schema,
+        batch_size: int,
+    ):
+        """
+        Create an in-memory cache.
+
+        Parameters
+        ----------
+        schema : pa.Schema
+            Cache schema.
+        batch_size : int
+            Target batch size when writing chunks.
+        """
+        return cls(
+            table=schema.empty_table(),
+            batch_size=batch_size,
+        )
+
+    def write_rows(
+        self,
+        rows: list[dict[str, Any]],
+    ):
+        """
+        Write rows to the cache.
+
+        Parameters
+        ----------
+        rows : list[dict[str, Any]]
+            A list of rows represented by dictionaries mapping fields to values.
+        """
+        if not rows:
+            return
+        batch = pa.RecordBatch.from_pylist(rows, schema=self.schema)
+        self.write_batch(batch)
+
+    def write_columns(
+        self,
+        columns: dict[str, list | np.ndarray | pa.Array],
+    ):
+        """
+        Write columnar data to the cache.
+
+        Parameters
+        ----------
+        columns : dict[str, list | np.ndarray | pa.Array]
+            A mapping of columnar field names to lists of values.
+        """
+        if not columns:
+            return
+        batch = pa.RecordBatch.from_pydict(columns)
+        self.write_batch(batch)
+
+    def write_batch(
+        self,
+        batch: pa.RecordBatch,
+    ):
+        """
+        Write a batch to the cache.
+
+        Parameters
+        ----------
+        batch : pa.RecordBatch
+            A batch of columnar data.
+        """
+        size = batch.num_rows
+        if self._buffer:
+            size += sum([b.num_rows for b in self._buffer])
+
+        # check size
+        if size < self._batch_size:
+            self._buffer.append(batch)
+            return
+
+        if self._buffer:
+            self._buffer.append(batch)
+            combined_arrays = [
+                pa.concat_arrays([b.column(name) for b in self._buffer])
+                for name in self.schema.names
+            ]
+            batch = pa.RecordBatch.from_arrays(
+                combined_arrays, schema=self.schema
+            )
+            self._buffer = []
+
+        # write batch
+        self.write_table(pa.Table.from_batches([batch]))
+
+    def write_table(
+        self,
+        table: pa.Table,
+    ):
+        """
+        Write a table directly to the cache.
+
+        Parameters
+        ----------
+        table : pa.Table
+            A populated table.
+        """
+        self._table = pa.concat_tables([self._table, table])
+
+    def flush(self):
+        """Flush the cache buffer."""
+        if self._buffer:
+            combined_arrays = [
+                pa.concat_arrays([b.column(name) for b in self._buffer])
+                for name in self.schema.names
+            ]
+            batch = pa.RecordBatch.from_arrays(
+                combined_arrays, schema=self.schema
+            )
+            self._table = pa.concat_tables(
+                [self._table, pa.Table.from_batches([batch])]
+            )
+            self._buffer = []
+
+    def sort_by(
+        self,
+        sorting: list[tuple[str, str]],
+    ):
+        """
+        Sort the cache in-place.
+
+        Parameters
+        ----------
+        sorting : list[tuple[str, str]]
+            Sorting arguments in PyArrow format (e.g. [('a', 'ascending'), ('b', 'descending')]).
+        """
+        self.flush()
+        self._table = self._table.sort_by(sorting)
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - ensures data is flushed."""
+        self.flush()
+
+    def to_reader(self) -> MemoryCacheReader:
+        """Get a cache reader."""
+        self.flush()
+        return MemoryCacheReader(
+            table=self._table, batch_size=self._batch_size
+        )
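
A small usage sketch of the writer/reader pair above: writes smaller than batch_size accumulate in the internal buffer, and the context manager's __exit__ flushes the remainder into the backing table. The column names and values here are invented for illustration:

import pyarrow as pa
import pyarrow.compute as pc

from valor_lite.cache import MemoryCacheWriter

schema = pa.schema([("x", pa.float64()), ("y", pa.float64())])

# three rows stay buffered (3 < 1024); __exit__ flushes them
with MemoryCacheWriter.create(schema=schema, batch_size=1024) as writer:
    writer.write_columns({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})

reader = writer.to_reader()

# iterate the cache as a column-stacked NumPy array, filtered on x
for arr in reader.iterate_arrays(
    numeric_columns=["x", "y"],
    filter=pc.field("x") > 1.0,
):
    print(arr.shape)  # (2, 2): the rows where x > 1.0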