valor-lite 0.36.6__py3-none-any.whl → 0.37.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valor_lite/cache/__init__.py +11 -0
  2. valor_lite/cache/compute.py +211 -0
  3. valor_lite/cache/ephemeral.py +302 -0
  4. valor_lite/cache/persistent.py +536 -0
  5. valor_lite/classification/__init__.py +5 -10
  6. valor_lite/classification/annotation.py +4 -0
  7. valor_lite/classification/computation.py +233 -251
  8. valor_lite/classification/evaluator.py +882 -0
  9. valor_lite/classification/loader.py +97 -0
  10. valor_lite/classification/metric.py +141 -4
  11. valor_lite/classification/shared.py +184 -0
  12. valor_lite/classification/utilities.py +221 -118
  13. valor_lite/exceptions.py +5 -0
  14. valor_lite/object_detection/__init__.py +5 -4
  15. valor_lite/object_detection/annotation.py +13 -1
  16. valor_lite/object_detection/computation.py +368 -299
  17. valor_lite/object_detection/evaluator.py +804 -0
  18. valor_lite/object_detection/loader.py +292 -0
  19. valor_lite/object_detection/metric.py +152 -3
  20. valor_lite/object_detection/shared.py +206 -0
  21. valor_lite/object_detection/utilities.py +182 -100
  22. valor_lite/semantic_segmentation/__init__.py +5 -4
  23. valor_lite/semantic_segmentation/annotation.py +7 -0
  24. valor_lite/semantic_segmentation/computation.py +20 -110
  25. valor_lite/semantic_segmentation/evaluator.py +414 -0
  26. valor_lite/semantic_segmentation/loader.py +205 -0
  27. valor_lite/semantic_segmentation/shared.py +149 -0
  28. valor_lite/semantic_segmentation/utilities.py +6 -23
  29. {valor_lite-0.36.6.dist-info → valor_lite-0.37.5.dist-info}/METADATA +3 -1
  30. valor_lite-0.37.5.dist-info/RECORD +49 -0
  31. {valor_lite-0.36.6.dist-info → valor_lite-0.37.5.dist-info}/WHEEL +1 -1
  32. valor_lite/classification/manager.py +0 -545
  33. valor_lite/object_detection/manager.py +0 -864
  34. valor_lite/profiling.py +0 -374
  35. valor_lite/semantic_segmentation/benchmark.py +0 -237
  36. valor_lite/semantic_segmentation/manager.py +0 -446
  37. valor_lite-0.36.6.dist-info/RECORD +0 -41
  38. {valor_lite-0.36.6.dist-info → valor_lite-0.37.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,11 @@
+ from .compute import sort
+ from .ephemeral import MemoryCacheReader, MemoryCacheWriter
+ from .persistent import FileCacheReader, FileCacheWriter
+
+ __all__ = [
+     "FileCacheReader",
+     "FileCacheWriter",
+     "MemoryCacheReader",
+     "MemoryCacheWriter",
+     "sort",
+ ]
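
For orientation, a sketch of the import surface this new package re-exports (it assumes the installed wheel keeps the valor_lite top-level name):

# Public API of the new valor_lite.cache package, per __all__ above.
from valor_lite.cache import (
    FileCacheReader,
    FileCacheWriter,
    MemoryCacheReader,
    MemoryCacheWriter,
    sort,
)

Note that paginate_index is not re-exported here; it remains importable from valor_lite.cache.compute.
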
@@ -0,0 +1,211 @@
+ import heapq
+ import tempfile
+ from pathlib import Path
+ from typing import Callable, Generator
+
+ import pyarrow as pa
+ import pyarrow.compute as pc
+
+ from valor_lite.cache.ephemeral import MemoryCacheReader, MemoryCacheWriter
+ from valor_lite.cache.persistent import FileCacheReader, FileCacheWriter
+
+
+ def _merge(
+     source: MemoryCacheReader | FileCacheReader,
+     sink: MemoryCacheWriter | FileCacheWriter,
+     intermediate_sink: MemoryCacheWriter | FileCacheWriter,
+     batch_size: int,
+     sorting: list[tuple[str, str]],
+     columns: list[str] | None = None,
+     table_sort_override: Callable[[pa.Table], pa.Table] | None = None,
+ ):
+     """Merge locally sorted cache fragments."""
+     for tbl in source.iterate_tables(columns=columns):
+         if table_sort_override is not None:
+             sorted_tbl = table_sort_override(tbl)
+         else:
+             sorted_tbl = tbl.sort_by(sorting)
+         intermediate_sink.write_table(sorted_tbl)
+     intermediate_source = intermediate_sink.to_reader()
+
+     # define merge key
+     def create_sort_key(
+         batches: list[pa.RecordBatch],
+         batch_idx: int,
+         row_idx: int,
+     ):
+         args = [
+             -batches[batch_idx][name][row_idx].as_py()
+             if direction == "descending"
+             else batches[batch_idx][name][row_idx].as_py()
+             for name, direction in sorting
+         ]
+         return (
+             *args,
+             batch_idx,
+             row_idx,
+         )
+
+     # merge sorted rows
+     heap = []
+     batch_iterators = []
+     batches = []
+     for batch_idx, batch_iter in enumerate(
+         intermediate_source.iterate_fragment_batch_iterators(
+             batch_size=batch_size
+         )
+     ):
+         batch_iterators.append(batch_iter)
+         batches.append(next(batch_iterators[batch_idx], None))
+         if batches[batch_idx] is not None and len(batches[batch_idx]) > 0:
+             heap.append(create_sort_key(batches, batch_idx, 0))
+     heapq.heapify(heap)
+
+     while heap:
+         row = heapq.heappop(heap)
+         batch_idx = row[-2]
+         row_idx = row[-1]
+         row_table = batches[batch_idx].slice(row_idx, 1)
+         sink.write_batch(row_table)
+         row_idx += 1
+         if row_idx < len(batches[batch_idx]):
+             heapq.heappush(
+                 heap,
+                 create_sort_key(batches, batch_idx, row_idx),
+             )
+         else:
+             batches[batch_idx] = next(batch_iterators[batch_idx], None)
+             if batches[batch_idx] is not None and len(batches[batch_idx]) > 0:
+                 heapq.heappush(
+                     heap,
+                     create_sort_key(batches, batch_idx, 0),
+                 )
+
+     sink.flush()
+
+
+ def sort(
+     source: MemoryCacheReader | FileCacheReader,
+     sink: MemoryCacheWriter | FileCacheWriter,
+     batch_size: int,
+     sorting: list[tuple[str, str]],
+     columns: list[str] | None = None,
+     table_sort_override: Callable[[pa.Table], pa.Table] | None = None,
+ ):
+     """
+     Sort data into new cache.
+
+     Parameters
+     ----------
+     source : MemoryCacheReader | FileCacheReader
+         A read-only cache. If file-based, each file must be locally sorted.
+     sink : MemoryCacheWriter | FileCacheWriter
+         The cache where sorted data will be written.
+     batch_size : int
+         Maximum number of rows allowed to be read into memory per cache file.
+     sorting : list[tuple[str, str]]
+         Sorting arguments in PyArrow format (e.g. [('a', 'ascending'), ('b', 'descending')]).
+         Note that only numeric fields are supported.
+     columns : list[str], optional
+         Option to only read a subset of columns.
+     table_sort_override : Callable[[pa.Table], pa.Table], optional
+         Option to override sort function for singular cache fragments.
+     """
+
+     if source.count_tables() == 1:
+         for tbl in source.iterate_tables(columns=columns):
+             if table_sort_override is not None:
+                 sorted_tbl = table_sort_override(tbl)
+             else:
+                 sorted_tbl = tbl.sort_by(sorting)
+             sink.write_table(sorted_tbl)
+         sink.flush()
+         return
+
+     if isinstance(sink, FileCacheWriter):
+         with tempfile.TemporaryDirectory() as tmpdir:
+             intermediate_sink = FileCacheWriter.create(
+                 path=Path(tmpdir) / "sorting_intermediate",
+                 schema=sink.schema,
+                 batch_size=sink.batch_size,
+                 rows_per_file=sink.rows_per_file,
+                 compression=sink.compression,
+                 delete_if_exists=False,
+             )
+             _merge(
+                 source=source,
+                 sink=sink,
+                 intermediate_sink=intermediate_sink,
+                 batch_size=batch_size,
+                 sorting=sorting,
+                 columns=columns,
+                 table_sort_override=table_sort_override,
+             )
+     else:
+         intermediate_sink = MemoryCacheWriter.create(
+             schema=sink.schema,
+             batch_size=sink.batch_size,
+         )
+         _merge(
+             source=source,
+             sink=sink,
+             intermediate_sink=intermediate_sink,
+             batch_size=batch_size,
+             sorting=sorting,
+             columns=columns,
+             table_sort_override=table_sort_override,
+         )
+
+
+ def paginate_index(
+     source: MemoryCacheReader | FileCacheReader,
+     column_key: str,
+     modifier: pc.Expression | None = None,
+     limit: int | None = None,
+     offset: int = 0,
+ ) -> Generator[pa.Table, None, None]:
+     """
+     Iterate through a paginated cache reader.
+
+     Note this function expects unique keys to be fragment-aligned.
+     """
+     total = source.count_rows()
+     limit = limit if limit else total
+
+     # pagination broader than data scope
+     if offset == 0 and limit >= total:
+         for tbl in source.iterate_tables(filter=modifier):
+             yield tbl
+         return
+     elif offset >= total:
+         return
+
+     curr_idx = 0
+     for tbl in source.iterate_tables(filter=modifier):
+         if tbl.num_rows == 0:
+             continue
+
+         # sort the unique keys as they may be out of order
+         unique_values = pc.unique(tbl[column_key]).sort()  # type: ignore[reportAttributeAccessIssue]
+         n_unique = len(unique_values)
+         prev_idx = curr_idx
+         curr_idx += n_unique
+
+         # check for page overlap
+         if curr_idx <= offset:
+             continue
+         elif prev_idx >= (offset + limit):
+             return
+
+         # apply any pagination conditions
+         condition = pc.scalar(True)
+         if prev_idx < offset and curr_idx > offset:
+             condition &= (
+                 pc.field(column_key) >= unique_values[offset - prev_idx]
+             )
+         if prev_idx < (offset + limit) and curr_idx > (offset + limit):
+             condition &= (
+                 pc.field(column_key) < unique_values[offset + limit - prev_idx]
+             )
+
+         yield tbl.filter(condition)
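
To make the new module concrete, a minimal usage sketch of sort() driven entirely by the in-memory cache classes added in this release; the schema, field names, and values are illustrative only:

import pyarrow as pa

from valor_lite.cache import MemoryCacheWriter, sort

# Hypothetical two-column schema.
schema = pa.schema([("score", pa.float64()), ("label", pa.int64())])

# Stage some unsorted rows in an in-memory write cache.
writer = MemoryCacheWriter.create(schema=schema, batch_size=4)
writer.write_rows(
    [
        {"score": 0.2, "label": 1},
        {"score": 0.9, "label": 0},
        {"score": 0.5, "label": 1},
    ]
)
source = writer.to_reader()  # flushes the buffer and exposes a reader

# Sort into a fresh cache, highest score first. With a single in-memory
# table the function takes the fast path and skips the k-way merge.
sink = MemoryCacheWriter.create(schema=schema, batch_size=4)
sort(
    source=source,
    sink=sink,
    batch_size=4,
    sorting=[("score", "descending")],
)

for tbl in sink.to_reader().iterate_tables():
    print(tbl.to_pydict())  # rows ordered by descending score
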
@@ -0,0 +1,302 @@
+ from collections.abc import Iterator
+ from typing import Any
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.compute as pc
+
+
+ class MemoryCache:
+     def __init__(
+         self,
+         table: pa.Table,
+         batch_size: int,
+     ):
+         self._table = table
+         self._batch_size = batch_size
+
+     @property
+     def schema(self) -> pa.Schema:
+         return self._table.schema
+
+     @property
+     def batch_size(self) -> int:
+         return self._batch_size
+
+     def count_tables(self) -> int:
+         """Count the number of tables in the cache."""
+         return 1
+
+     def count_rows(self) -> int:
+         """Count the number of rows in the cache."""
+         return self._table.num_rows
+
+
+ class MemoryCacheReader(MemoryCache):
+     def iterate_tables(
+         self,
+         columns: list[str] | None = None,
+         filter: pc.Expression | None = None,
+     ) -> Iterator[pa.Table]:
+         """
+         Iterate over tables within the cache.
+
+         Parameters
+         ----------
+         columns : list[str], optional
+             Optionally select columns to be returned.
+         filter : pyarrow.compute.Expression, optional
+             Optionally filter table before returning.
+
+         Returns
+         -------
+         Iterator[pa.Table]
+         """
+         table = self._table
+         if filter is not None:
+             table = table.filter(filter)
+         if columns is not None:
+             table = table.select(columns)
+         yield table
+
+     def iterate_arrays(
+         self,
+         numeric_columns: list[str] | None = None,
+         filter: pc.Expression | None = None,
+     ) -> Iterator[np.ndarray]:
+         """
+         Iterate over chunks within the cache returning arrays.
+
+         Parameters
+         ----------
+         numeric_columns : list[str], optional
+             Optionally select numeric columns to be returned within an array.
+         filter : pyarrow.compute.Expression, optional
+             Optionally filter table before returning.
+
+         Returns
+         -------
+         Iterator[np.ndarray]
+         """
+         for tbl in self.iterate_tables(columns=numeric_columns, filter=filter):
+             yield np.column_stack(
+                 [tbl.column(i).to_numpy() for i in range(tbl.num_columns)]
+             )
+
+     def iterate_tables_with_arrays(
+         self,
+         columns: list[str] | None = None,
+         filter: pc.Expression | None = None,
+         numeric_columns: list[str] | None = None,
+     ) -> Iterator[tuple[pa.Table, np.ndarray]]:
+         """
+         Iterate over chunks within the cache returning both tables and arrays.
+
+         Parameters
+         ----------
+         columns : list[str], optional
+             Optionally select columns to be returned.
+         filter : pyarrow.compute.Expression, optional
+             Optionally filter table before returning.
+         numeric_columns : list[str], optional
+             Optionally select numeric columns to be returned within an array.
+
+         Returns
+         -------
+         Iterator[tuple[pa.Table, np.ndarray]]
+
+         """
+         _columns = set(columns) if columns else set()
+         _numeric_columns = set(numeric_columns) if numeric_columns else set()
+         columns = list(_columns.union(_numeric_columns))
+         for tbl in self.iterate_tables(
+             columns=columns,
+             filter=filter,
+         ):
+             table_columns = numeric_columns if numeric_columns else tbl.columns
+             yield tbl, np.column_stack(
+                 [tbl[col].to_numpy() for col in table_columns]
+             )
+
+     def iterate_fragment_batch_iterators(
+         self, batch_size: int
+     ) -> Iterator[Iterator[pa.RecordBatch]]:
+         """
+         Yield a table batch iterator.
+
+         This is intended to emulate file-based access patterns.
+
+         Parameters
+         ----------
+         batch_size : int
+             Maximum number of rows allowed to be read per batch.
+
+         Yields
+         ------
+         Iterator[Iterator[pa.RecordBatch]]
+         """
+         yield iter(self._table.to_batches(max_chunksize=batch_size))
+
+
+ class MemoryCacheWriter(MemoryCache):
+     def __init__(
+         self,
+         table: pa.Table,
+         batch_size: int,
+     ):
+         super().__init__(
+             table=table,
+             batch_size=batch_size,
+         )
+
+         # internal state
+         self._buffer = []
+
+     @classmethod
+     def create(
+         cls,
+         schema: pa.Schema,
+         batch_size: int,
+     ):
+         """
+         Create an in-memory cache.
+
+         Parameters
+         ----------
+         schema : pa.Schema
+             Cache schema.
+         batch_size : int
+             Target batch size when writing chunks.
+         """
+         return cls(
+             table=schema.empty_table(),
+             batch_size=batch_size,
+         )
+
+     def write_rows(
+         self,
+         rows: list[dict[str, Any]],
+     ):
+         """
+         Write rows to cache.
+
+         Parameters
+         ----------
+         rows : list[dict[str, Any]]
+             A list of rows represented by dictionaries mapping fields to values.
+         """
+         if not rows:
+             return
+         batch = pa.RecordBatch.from_pylist(rows, schema=self.schema)
+         self.write_batch(batch)
+
+     def write_columns(
+         self,
+         columns: dict[str, list | np.ndarray | pa.Array],
+     ):
+         """
+         Write columnar data to cache.
+
+         Parameters
+         ----------
+         columns : dict[str, list | np.ndarray | pa.Array]
+             A mapping of columnar field names to lists of values.
+         """
+         if not columns:
+             return
+         batch = pa.RecordBatch.from_pydict(columns)
+         self.write_batch(batch)
+
+     def write_batch(
+         self,
+         batch: pa.RecordBatch,
+     ):
+         """
+         Write a batch to cache.
+
+         Parameters
+         ----------
+         batch : pa.RecordBatch
+             A batch of columnar data.
+         """
+         size = batch.num_rows
+         if self._buffer:
+             size += sum([b.num_rows for b in self._buffer])
+
+         # check size
+         if size < self._batch_size:
+             self._buffer.append(batch)
+             return
+
+         if self._buffer:
+             self._buffer.append(batch)
+             combined_arrays = [
+                 pa.concat_arrays([b.column(name) for b in self._buffer])
+                 for name in self.schema.names
+             ]
+             batch = pa.RecordBatch.from_arrays(
+                 combined_arrays, schema=self.schema
+             )
+             self._buffer = []
+
+         # write batch
+         self.write_table(pa.Table.from_batches([batch]))
+
+     def write_table(
+         self,
+         table: pa.Table,
+     ):
+         """
+         Write a table directly to cache.
+
+         Parameters
+         ----------
+         table : pa.Table
+             A populated table.
+         """
+         self._table = pa.concat_tables([self._table, table])
+
+     def flush(self):
+         """Flush the cache buffer."""
+         if self._buffer:
+             combined_arrays = [
+                 pa.concat_arrays([b.column(name) for b in self._buffer])
+                 for name in self.schema.names
+             ]
+             batch = pa.RecordBatch.from_arrays(
+                 combined_arrays, schema=self.schema
+             )
+             self._table = pa.concat_tables(
+                 [self._table, pa.Table.from_batches([batch])]
+             )
+             self._buffer = []
+
+     def sort_by(
+         self,
+         sorting: list[tuple[str, str]],
+     ):
+         """
+         Sort cache in-place.
+
+         Parameters
+         ----------
+         sorting : list[tuple[str, str]]
+             Sorting arguments in PyArrow format (e.g. [('a', 'ascending'), ('b', 'descending')]).
+         """
+         self.flush()
+         self._table = self._table.sort_by(sorting)
+
+     def __enter__(self):
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit - ensures data is flushed."""
+         self.flush()
+
+     def to_reader(self) -> MemoryCacheReader:
+         """Get cache reader."""
+         self.flush()
+         return MemoryCacheReader(
+             table=self._table, batch_size=self._batch_size
+         )
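
A short sketch of the writer's buffering behavior as implemented above: batches smaller than batch_size accumulate in an internal buffer and only reach the backing table on flush(), which both the context manager exit and to_reader() trigger. The schema and values below are hypothetical:

import pyarrow as pa

from valor_lite.cache import MemoryCacheWriter

schema = pa.schema([("value", pa.int64())])

with MemoryCacheWriter.create(schema=schema, batch_size=100) as writer:
    # Three rows are fewer than batch_size, so they stay in the buffer
    # and are not yet part of the backing table.
    writer.write_columns({"value": [3, 1, 2]})
    print(writer.count_rows())  # 0 -- still buffered

# __exit__ called flush(), so the rows are now in the table.
print(writer.count_rows())  # 3

writer.sort_by([("value", "ascending")])
reader = writer.to_reader()
print(next(reader.iterate_arrays(numeric_columns=["value"])))  # 3x1 array: 1, 2, 3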