torchcodec-0.3.0-cp313-cp313t-manylinux_2_28_x86_64.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchcodec might be problematic.
- torchcodec/__init__.py +16 -0
- torchcodec/_core/AVIOBytesContext.cpp +70 -0
- torchcodec/_core/AVIOBytesContext.h +32 -0
- torchcodec/_core/AVIOContextHolder.cpp +50 -0
- torchcodec/_core/AVIOContextHolder.h +65 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +80 -0
- torchcodec/_core/AVIOFileLikeContext.h +54 -0
- torchcodec/_core/CMakeLists.txt +237 -0
- torchcodec/_core/CudaDeviceInterface.cpp +289 -0
- torchcodec/_core/CudaDeviceInterface.h +34 -0
- torchcodec/_core/DeviceInterface.cpp +88 -0
- torchcodec/_core/DeviceInterface.h +66 -0
- torchcodec/_core/Encoder.cpp +319 -0
- torchcodec/_core/Encoder.h +39 -0
- torchcodec/_core/FFMPEGCommon.cpp +264 -0
- torchcodec/_core/FFMPEGCommon.h +180 -0
- torchcodec/_core/Frame.h +47 -0
- torchcodec/_core/Metadata.h +70 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1947 -0
- torchcodec/_core/SingleStreamDecoder.h +462 -0
- torchcodec/_core/StreamOptions.h +49 -0
- torchcodec/_core/__init__.py +39 -0
- torchcodec/_core/_metadata.py +277 -0
- torchcodec/_core/custom_ops.cpp +681 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +226 -0
- torchcodec/_core/ops.py +381 -0
- torchcodec/_core/pybind_ops.cpp +45 -0
- torchcodec/_frame.py +145 -0
- torchcodec/_internally_replaced_utils.py +53 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +430 -0
- torchcodec/decoders/__init__.py +11 -0
- torchcodec/decoders/_audio_decoder.py +168 -0
- torchcodec/decoders/_decoder_utils.py +52 -0
- torchcodec/decoders/_video_decoder.py +399 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_decoder4.so +0 -0
- torchcodec/libtorchcodec_decoder5.so +0 -0
- torchcodec/libtorchcodec_decoder6.so +0 -0
- torchcodec/libtorchcodec_decoder7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +285 -0
- torchcodec/samplers/_time_based.py +348 -0
- torchcodec/version.py +2 -0
- torchcodec-0.3.0.dist-info/LICENSE +28 -0
- torchcodec-0.3.0.dist-info/METADATA +280 -0
- torchcodec-0.3.0.dist-info/RECORD +57 -0
- torchcodec-0.3.0.dist-info/WHEEL +5 -0
- torchcodec-0.3.0.dist-info/top_level.txt +2 -0
torchcodec/_core/SingleStreamDecoder.h
@@ -0,0 +1,462 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string_view>
+
+#include "src/torchcodec/_core/AVIOContextHolder.h"
+#include "src/torchcodec/_core/DeviceInterface.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/Frame.h"
+#include "src/torchcodec/_core/StreamOptions.h"
+
+namespace facebook::torchcodec {
+
+// The SingleStreamDecoder class can be used to decode video frames to Tensors.
+// Note that SingleStreamDecoder is not thread-safe.
+// Do not call non-const APIs concurrently on the same object.
+class SingleStreamDecoder {
+ public:
+  // --------------------------------------------------------------------------
+  // CONSTRUCTION API
+  // --------------------------------------------------------------------------
+
+  enum class SeekMode { exact, approximate };
+
+  // Creates a SingleStreamDecoder from the video at videoFilePath.
+  explicit SingleStreamDecoder(
+      const std::string& videoFilePath,
+      SeekMode seekMode = SeekMode::exact);
+
+  // Creates a SingleStreamDecoder using the provided AVIOContext inside the
+  // AVIOContextHolder. AVIOContextHolder is the base class; derived classes
+  // specialize how the custom read, seek, and write operations work.
+  explicit SingleStreamDecoder(
+      std::unique_ptr<AVIOContextHolder> context,
+      SeekMode seekMode = SeekMode::exact);
+
+  // --------------------------------------------------------------------------
+  // VIDEO METADATA QUERY API
+  // --------------------------------------------------------------------------
+
+  // Updates the metadata of the video to accurate values obtained by scanning
+  // the contents of the video file. Also updates each StreamInfo's index, i.e.
+  // the allFrames and keyFrames vectors.
+  void scanFileAndUpdateMetadataAndIndex();
+
+  // Returns the metadata for the container.
+  ContainerMetadata getContainerMetadata() const;
+
+  // Returns the key frame indices as a tensor. The tensor is 1D and contains
+  // int64 values, where each value is the frame index for a key frame.
+  torch::Tensor getKeyFrameIndices();
+
+  // --------------------------------------------------------------------------
+  // ADDING STREAMS API
+  // --------------------------------------------------------------------------
+
+  void addVideoStream(
+      int streamIndex,
+      const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
+  void addAudioStream(
+      int streamIndex,
+      const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
+
+  // --------------------------------------------------------------------------
+  // DECODING AND SEEKING APIs
+  // --------------------------------------------------------------------------
+
+  // Places the cursor at the first frame on or after the position in seconds.
+  // Calling getNextFrame() will return the first frame at
+  // or after this position.
+  void setCursorPtsInSeconds(double seconds);
+
+  // Decodes the frame where the current cursor position is. It also advances
+  // the cursor to the next frame.
+  FrameOutput getNextFrame();
+
+  FrameOutput getFrameAtIndex(int64_t frameIndex);
+
+  // Returns frames at the given indices for a given stream as a single stacked
+  // Tensor.
+  FrameBatchOutput getFramesAtIndices(const std::vector<int64_t>& frameIndices);
+
+  // Returns frames within a given range. The range is defined by [start, stop).
+  // The values retrieved from the range are: [start, start+step,
+  // start+(2*step), start+(3*step), ..., stop). The default for step is 1.
+  FrameBatchOutput getFramesInRange(int64_t start, int64_t stop, int64_t step);
+
+  // Decodes the first frame in any added stream that is visible at a given
+  // timestamp. Frames in the video have a presentation timestamp and a
+  // duration. For example, if a frame has presentation timestamp of 5.0s and a
+  // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0),
+  // i.e. it will be returned when this function is called with seconds=5.0 or
+  // seconds=5.999, etc.
+  FrameOutput getFramePlayedAt(double seconds);
+
+  FrameBatchOutput getFramesPlayedAt(const std::vector<double>& timestamps);
+
+  // Returns frames within a given pts range. The range is defined by
+  // [startSeconds, stopSeconds) with respect to the pts values for frames. The
+  // returned frames are in pts order.
+  //
+  // Note that while stopSeconds is excluded in the half open range, this really
+  // only makes a difference when stopSeconds is exactly the pts value for a
+  // frame. Otherwise, the moment in time immediately before stopSeconds is in
+  // the range, and that time maps to the same frame as stopSeconds.
+  //
+  // The frames returned are the frames that would be played by our abstract
+  // player. Our abstract player displays frames based on pts only. It displays
+  // frame i starting at the pts for frame i, and stops at the pts for frame
+  // i+1. This model ignores a frame's reported duration.
+  //
+  // Valid values for startSeconds and stopSeconds are:
+  //
+  // [minPtsSecondsFromScan, maxPtsSecondsFromScan)
+  FrameBatchOutput getFramesPlayedInRange(
+      double startSeconds,
+      double stopSeconds);
+
+  AudioFramesOutput getFramesPlayedInRangeAudio(
+      double startSeconds,
+      std::optional<double> stopSecondsOptional = std::nullopt);
+
+  class EndOfFileException : public std::runtime_error {
+   public:
+    explicit EndOfFileException(const std::string& msg)
+        : std::runtime_error(msg) {}
+  };
+
+  // --------------------------------------------------------------------------
+  // MORALLY PRIVATE APIS
+  // --------------------------------------------------------------------------
+  // These are APIs that should be private, but that are effectively exposed for
+  // practical reasons, typically for testing purposes.
+
+  // Once getFrameAtIndex supports the preAllocatedOutputTensor parameter, we
+  // can move it back to private.
+  FrameOutput getFrameAtIndexInternal(
+      int64_t frameIndex,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  // Exposed for _test_frame_pts_equality, which is used to test non-regression
+  // of pts resolution (64 to 32 bit floats).
+  double getPtsSecondsForFrame(int64_t frameIndex);
+
+  // Exposed for performance testing.
+  struct DecodeStats {
+    int64_t numSeeksAttempted = 0;
+    int64_t numSeeksDone = 0;
+    int64_t numSeeksSkipped = 0;
+    int64_t numPacketsRead = 0;
+    int64_t numPacketsSentToDecoder = 0;
+    int64_t numFramesReceivedByDecoder = 0;
+    int64_t numFlushes = 0;
+  };
+
+  DecodeStats getDecodeStats() const;
+  void resetDecodeStats();
+
+ private:
+  // --------------------------------------------------------------------------
+  // STREAMINFO AND ASSOCIATED STRUCTS
+  // --------------------------------------------------------------------------
+
+  struct FrameInfo {
+    int64_t pts = 0;
+
+    // The value of the nextPts default is important: the last frame's nextPts
+    // will be INT64_MAX, which ensures that the allFrames vec contains
+    // FrameInfo structs with *increasing* nextPts values. That's a necessary
+    // condition for the binary searches on those values to work properly (as
+    // typically done during pts -> index conversions).
+    // TODO: This field is unset (left to the default) for entries in the
+    // keyFrames vec!
+    int64_t nextPts = INT64_MAX;
+
+    // Note that frameIndex is ALWAYS the index into all of the frames in that
+    // stream, even when the FrameInfo is part of the key frame index. Given a
+    // FrameInfo for a key frame, the frameIndex allows us to know which frame
+    // that is in the stream.
+    int64_t frameIndex = 0;
+
+    // Indicates whether a frame is a key frame. It may appear redundant as it's
+    // only true for FrameInfos in the keyFrames index, but it is needed to
+    // correctly map frames between allFrames and keyFrames during the scan.
+    bool isKeyFrame = false;
+  };
+
+  struct FilterGraphContext {
+    UniqueAVFilterGraph filterGraph;
+    AVFilterContext* sourceContext = nullptr;
+    AVFilterContext* sinkContext = nullptr;
+  };
+
+  struct DecodedFrameContext {
+    int decodedWidth;
+    int decodedHeight;
+    AVPixelFormat decodedFormat;
+    int expectedWidth;
+    int expectedHeight;
+    bool operator==(const DecodedFrameContext&);
+    bool operator!=(const DecodedFrameContext&);
+  };
+
+  struct StreamInfo {
+    int streamIndex = -1;
+    AVStream* stream = nullptr;
+    AVMediaType avMediaType = AVMEDIA_TYPE_UNKNOWN;
+
+    AVRational timeBase = {};
+    UniqueAVCodecContext codecContext;
+
+    // The FrameInfo indices we built when scanFileAndUpdateMetadataAndIndex was
+    // called.
+    std::vector<FrameInfo> keyFrames;
+    std::vector<FrameInfo> allFrames;
+
+    // TODO since the decoder is single-stream, these should be decoder fields,
+    // not streamInfo fields. And they should be defined right next to
+    // `cursor_`, with joint documentation.
+    int64_t lastDecodedAvFramePts = 0;
+    int64_t lastDecodedAvFrameDuration = 0;
+    VideoStreamOptions videoStreamOptions;
+    AudioStreamOptions audioStreamOptions;
+
+    // Color-conversion fields. Only one of FilterGraphContext and
+    // UniqueSwsContext should be non-null.
+    FilterGraphContext filterGraphContext;
+    ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
+    UniqueSwsContext swsContext;
+    UniqueSwrContext swrContext;
+
+    // Used to know whether a new FilterGraphContext or UniqueSwsContext should
+    // be created before decoding a new frame.
+    DecodedFrameContext prevFrameContext;
+  };
+
+  // --------------------------------------------------------------------------
+  // INITIALIZERS
+  // --------------------------------------------------------------------------
+
+  void initializeDecoder();
+  // --------------------------------------------------------------------------
+  // DECODING APIS AND RELATED UTILS
+  // --------------------------------------------------------------------------
+
+  void setCursor(int64_t pts);
+  void setCursor(double) = delete; // prevent calls with doubles and floats
+  bool canWeAvoidSeeking() const;
+
+  void maybeSeekToBeforeDesiredPts();
+
+  UniqueAVFrame decodeAVFrame(
+      std::function<bool(const UniqueAVFrame&)> filterFunction);
+
+  FrameOutput getNextFrameInternal(
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  torch::Tensor maybePermuteHWC2CHW(torch::Tensor& hwcTensor);
+
+  FrameOutput convertAVFrameToFrameOutput(
+      UniqueAVFrame& avFrame,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  void convertAVFrameToFrameOutputOnCPU(
+      UniqueAVFrame& avFrame,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  void convertAudioAVFrameToFrameOutputOnCPU(
+      UniqueAVFrame& srcAVFrame,
+      FrameOutput& frameOutput);
+
+  torch::Tensor convertAVFrameToTensorUsingFilterGraph(
+      const UniqueAVFrame& avFrame);
+
+  int convertAVFrameToTensorUsingSwsScale(
+      const UniqueAVFrame& avFrame,
+      torch::Tensor& outputTensor);
+
+  std::optional<torch::Tensor> maybeFlushSwrBuffers();
+
+  // --------------------------------------------------------------------------
+  // COLOR CONVERSION LIBRARIES HANDLERS CREATION
+  // --------------------------------------------------------------------------
+
+  void createFilterGraph(
+      StreamInfo& streamInfo,
+      int expectedOutputHeight,
+      int expectedOutputWidth);
+
+  void createSwsContext(
+      StreamInfo& streamInfo,
+      const DecodedFrameContext& frameContext,
+      const enum AVColorSpace colorspace);
+
+  // --------------------------------------------------------------------------
+  // PTS <-> INDEX CONVERSIONS
+  // --------------------------------------------------------------------------
+
+  int getKeyFrameIndexForPts(int64_t pts) const;
+
+  // Returns the key frame index of the presentation timestamp using our index.
+  // We build this index by scanning the file in
+  // scanFileAndUpdateMetadataAndIndex.
+  int getKeyFrameIndexForPtsUsingScannedIndex(
+      const std::vector<SingleStreamDecoder::FrameInfo>& keyFrames,
+      int64_t pts) const;
+
+  int64_t secondsToIndexLowerBound(double seconds);
+
+  int64_t secondsToIndexUpperBound(double seconds);
+
+  int64_t getPts(int64_t frameIndex);
+
+  // --------------------------------------------------------------------------
+  // STREAM AND METADATA APIS
+  // --------------------------------------------------------------------------
+
+  void addStream(
+      int streamIndex,
+      AVMediaType mediaType,
+      const torch::Device& device = torch::kCPU,
+      std::optional<int> ffmpegThreadCount = std::nullopt);
+
+  // Returns the "best" stream index for a given media type. The "best" is
+  // determined by various heuristics in FFMPEG.
+  // See
+  // https://ffmpeg.org/doxygen/trunk/group__lavf__decoding.html#ga757780d38f482deb4d809c6c521fbcc2
+  // for more details about the heuristics.
+  // Returns the key frame index of the presentation timestamp using FFMPEG's
+  // index. Note that this index may be truncated for some files.
+  int getBestStreamIndex(AVMediaType mediaType);
+
+  int64_t getNumFrames(const StreamMetadata& streamMetadata);
+  double getMinSeconds(const StreamMetadata& streamMetadata);
+  double getMaxSeconds(const StreamMetadata& streamMetadata);
+
+  // --------------------------------------------------------------------------
+  // VALIDATION UTILS
+  // --------------------------------------------------------------------------
+
+  void validateActiveStream(
+      std::optional<AVMediaType> avMediaType = std::nullopt);
+  void validateScannedAllStreams(const std::string& msg);
+  void validateFrameIndex(
+      const StreamMetadata& streamMetadata,
+      int64_t frameIndex);
+
+  // --------------------------------------------------------------------------
+  // ATTRIBUTES
+  // --------------------------------------------------------------------------
+
+  SeekMode seekMode_;
+  ContainerMetadata containerMetadata_;
+  UniqueDecodingAVFormatContext formatContext_;
+  std::unique_ptr<DeviceInterface> deviceInterface_;
+  std::map<int, StreamInfo> streamInfos_;
+  const int NO_ACTIVE_STREAM = -2;
+  int activeStreamIndex_ = NO_ACTIVE_STREAM;
+
+  bool cursorWasJustSet_ = false;
+  // The desired position of the cursor in the stream. We send frames >= this
+  // pts to the user when they request a frame.
+  int64_t cursor_ = INT64_MIN;
+  // Stores various internal decoding stats.
+  DecodeStats decodeStats_;
+  // Stores the AVIOContext for the input buffer.
+  std::unique_ptr<AVIOContextHolder> avioContextHolder_;
+  // Whether or not we have already scanned all streams to update the metadata.
+  bool scannedAllStreams_ = false;
+  // Tracks that we've already been initialized.
+  bool initialized_ = false;
+};
+
+// --------------------------------------------------------------------------
+// FRAME TENSOR ALLOCATION APIs
+// --------------------------------------------------------------------------
+
+// Note [Frame Tensor allocation and height and width]
+//
+// We always allocate [N]HWC tensors. The low-level decoding functions all
+// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
+// the high-level decoding entry-points to permute that back to CHW, by calling
+// maybePermuteHWC2CHW().
+//
+// Also, importantly, the way we figure out the height and width of the
+// output frame tensor varies, and depends on the decoding entry-point. In
+// *decreasing order of accuracy*, we use the following sources for determining
+// height and width:
+// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
+//   AVFrame, *post*-resizing. This is only used for single-frame decoding APIs,
+//   on CPU, with filtergraph.
+// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
+//   the user-specified options if they exist, or the height and width of the
+//   AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
+//   our code or within FFmpeg code, this should be exactly the same as
+//   getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
+//   decoding APIs, on CPU with swscale, and on GPU.
+// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
+//   the user-specified options if they exist, or the height and width from the
+//   stream metadata, which itself got its value from the CodecContext, when the
+//   stream was added. This is used by batch decoding APIs, for both GPU and
+//   CPU.
+//
+// The source of truth for height and width really is the (resized) AVFrame: it
+// comes from the decoded output of FFmpeg. The info from the metadata (i.e.
+// from the CodecContext) may not be as accurate. However, the AVFrame is only
+// available late in the call stack, when the frame is decoded, while the
+// CodecContext is available early when a stream is added. This is why we use
+// the CodecContext for pre-allocating batched output tensors (we could
+// pre-allocate those only once we decode the first frame to get the info from
+// the AVFrame, but that's more complex logic).
+//
+// Because the sources for height and width may disagree, we may end up with
+// conflicts: e.g. if we pre-allocate a batch output tensor based on the
+// metadata info, but the decoded AVFrame has a different height and width.
+// It is very important to check the height and width assumptions where the
+// tensors' memory is used/filled in order to avoid segfaults.
+
+struct FrameDims {
+  int height;
+  int width;
+
+  FrameDims(int h, int w) : height(h), width(w) {}
+};
+
+// There's nothing preventing you from calling this on a non-resized frame, but
+// please don't.
+FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
+
+FrameDims getHeightAndWidthFromOptionsOrMetadata(
+    const VideoStreamOptions& videoStreamOptions,
+    const StreamMetadata& streamMetadata);
+
+FrameDims getHeightAndWidthFromOptionsOrAVFrame(
+    const VideoStreamOptions& videoStreamOptions,
+    const UniqueAVFrame& avFrame);
+
+torch::Tensor allocateEmptyHWCTensor(
+    int height,
+    int width,
+    torch::Device device,
+    std::optional<int> numFrames = std::nullopt);
+
+// Prints the SingleStreamDecoder::DecodeStats to the ostream.
+std::ostream& operator<<(
+    std::ostream& os,
+    const SingleStreamDecoder::DecodeStats& stats);
+
+SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode);
+
+} // namespace facebook::torchcodec
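
Read as a whole, the public sections above compose into a small decode loop: construct, scan, add a stream, position the cursor, then pull frames. The sketch below is a minimal illustration inferred from these declarations alone, not documented usage. In particular, the call order is an assumption, stream index 0 is an arbitrary example, FrameOutput is declared in Frame.h (not reproduced in this diff) so the `frame.data` access is presumed, and EndOfFileException is presumed to be how getNextFrame() signals exhaustion.

#include <string>

#include "src/torchcodec/_core/SingleStreamDecoder.h"

using namespace facebook::torchcodec;

// Hypothetical consumer of decoded frames; not part of torchcodec.
void process(const torch::Tensor& frameData);

// Decode every video frame from t=5.0s onwards.
void decodeFromFiveSeconds(const std::string& path) {
  // exact seek mode presumably pairs with a full scan, so that pts <-> index
  // conversions use the accurate allFrames/keyFrames index rather than
  // FFmpeg's (possibly truncated) one, per the comments above.
  SingleStreamDecoder decoder(path, SingleStreamDecoder::SeekMode::exact);
  decoder.scanFileAndUpdateMetadataAndIndex();
  decoder.addVideoStream(/*streamIndex=*/0);  // default VideoStreamOptions

  decoder.setCursorPtsInSeconds(5.0);  // getNextFrame() now starts at >= 5.0s
  try {
    while (true) {
      FrameOutput frame = decoder.getNextFrame();  // also advances the cursor
      process(frame.data);  // assumes FrameOutput exposes the tensor as `data`
    }
  } catch (const SingleStreamDecoder::EndOfFileException&) {
    // Presumed end-of-stream signal, per the exception declared above.
  }
}

Note that the same half-open pts semantics documented on getFramesPlayedInRange() apply here: a frame whose pts is exactly 5.0s is the first one returned, while a frame spanning [4.0, 5.0) is not.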
torchcodec/_core/StreamOptions.h
@@ -0,0 +1,49 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include <optional>
+#include <string>
+
+namespace facebook::torchcodec {
+
+enum ColorConversionLibrary {
+  // TODO: Add an AUTO option later.
+  // Use the libavfilter library for color conversion.
+  FILTERGRAPH,
+  // Use the libswscale library for color conversion.
+  SWSCALE
+};
+
+struct VideoStreamOptions {
+  VideoStreamOptions() {}
+
+  // Number of threads we pass to FFMPEG for decoding.
+  // 0 means FFMPEG will choose the number of threads automatically to fully
+  // utilize all cores. If not set, it will be the default FFMPEG behavior for
+  // the given codec.
+  std::optional<int> ffmpegThreadCount;
+  // Currently the dimension order can be either NHWC or NCHW.
+  // H=height, W=width, C=channel.
+  std::string dimensionOrder = "NCHW";
+  // The output height and width of the frame. If not specified, the output
+  // is the same as the original video.
+  std::optional<int> width;
+  std::optional<int> height;
+  std::optional<ColorConversionLibrary> colorConversionLibrary;
+  // By default we use CPU for decoding for both C++ and python users.
+  torch::Device device = torch::kCPU;
+};
+
+struct AudioStreamOptions {
+  AudioStreamOptions() {}
+
+  std::optional<int> sampleRate;
+};
+
+} // namespace facebook::torchcodec
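
Since these option structs are plain aggregates consumed by SingleStreamDecoder::addVideoStream() and addAudioStream() from the previous header, configuring a stream amounts to setting fields before the call. A hedged sketch follows; the 1280x720 target size, the SWSCALE choice, and stream index 0 are arbitrary example values, not recommendations from the package.

#include "src/torchcodec/_core/SingleStreamDecoder.h"
#include "src/torchcodec/_core/StreamOptions.h"

using namespace facebook::torchcodec;

void addConfiguredVideoStream(SingleStreamDecoder& decoder) {
  VideoStreamOptions options;
  options.ffmpegThreadCount = 0;    // let FFmpeg pick a thread count itself
  options.dimensionOrder = "NHWC";  // keep FFmpeg's native HWC layout; the
                                    // default "NCHW" implies the permute done
                                    // by maybePermuteHWC2CHW() in the decoder
  options.width = 1280;             // resize the output frames instead of
  options.height = 720;             // keeping the original video dimensions
  options.colorConversionLibrary = SWSCALE;  // unscoped enum, no qualifier
  options.device = torch::kCPU;     // the default; a CUDA device presumably
                                    // routes through CudaDeviceInterface

  decoder.addVideoStream(/*streamIndex=*/0, options);
}

The enum comment hints at why the library choice is exposed at all: there is no AUTO option yet, so callers who care must pick between libavfilter (FILTERGRAPH) and libswscale (SWSCALE) themselves.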
torchcodec/_core/__init__.py
@@ -0,0 +1,39 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from ._metadata import (
+    AudioStreamMetadata,
+    ContainerMetadata,
+    get_container_metadata,
+    get_container_metadata_from_header,
+    VideoStreamMetadata,
+)
+from .ops import (
+    _add_video_stream,
+    _get_key_frame_indices,
+    _test_frame_pts_equality,
+    add_audio_stream,
+    add_video_stream,
+    create_audio_encoder,
+    create_from_bytes,
+    create_from_file,
+    create_from_file_like,
+    create_from_tensor,
+    encode_audio,
+    get_ffmpeg_library_versions,
+    get_frame_at_index,
+    get_frame_at_pts,
+    get_frames_at_indices,
+    get_frames_by_pts,
+    get_frames_by_pts_in_range,
+    get_frames_by_pts_in_range_audio,
+    get_frames_in_range,
+    get_json_metadata,
+    get_next_frame,
+    scan_all_streams_to_update_metadata,
+    seek_to_pts,
+)