torchcodec 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 292f24721c347356202f9f0c691dcb00bf4835208f7a9e8730964014731a3202
4
- data.tar.gz: 9ae50e37cf15a1691aec3dddd07f46dc7a44a32f92ea0a90a99958341b4b5131
3
+ metadata.gz: 14bab54a5a5fd6d29ea5d09b11a85d53babcadd9a691201a0b07c759ed780216
4
+ data.tar.gz: dff1c8af2705ad7907d2c3b099248245d89382851cadb3bacad572e7052d15ae
5
5
  SHA512:
6
- metadata.gz: a0d72c1d15a3f67dde0ee86807ad2c72e6bb6da541ec1ad9dfb2ddd1fca4ceb35d428bf279ef704b923a5f6faaccdb2aa6ec2b5b62542aec8d1a186e64788eba
7
- data.tar.gz: 34d6865ce8da7c5a0d451633c9b8a4b4b42aafe55ad0c200e870b2e3809c83b3c872b40ba829d68fcacddaeac0de88fd206322248621fb0e077dac8aacc833e9
6
+ metadata.gz: deffa4d9dbc71fb9ea9551efdf0092d304f757dc7fd5fae57b7c88671f78fbf9fafeca4454cb0b369a12d6ea43ff9c2b46cfc5ba27d2637325252bbab890d20e
7
+ data.tar.gz: 074d93f9852592f64dd09ac449f90f9db194fb72c7088f5fefab3190e0bcfef61f86732472733b970645cc7eddaf002924520fc933ee816952dbde1b3d66d2b9
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.1.1 (2026-02-27)
2
+
3
+ - Added `AudioEncoder` class
4
+ - Added `VideoEncoder` and `VideoDecoder` classes
5
+
1
6
  ## 0.1.0 (2026-02-26)
2
7
 
3
8
  - First release
data/README.md CHANGED
@@ -20,12 +20,48 @@ gem "torchcodec"
20
20
 
21
21
  ## Getting Started
22
22
 
23
- This library follows the [Python API](https://meta-pytorch.org/torchcodec/). Most functionality is missing at the moment. PRs welcome!
23
+ This library follows the [Python API](https://meta-pytorch.org/torchcodec/). Some functionality is missing at the moment. PRs welcome!
24
+
25
+ ### Audio
26
+
27
+ Encoding
28
+
29
+ ```ruby
30
+ encoder = TorchCodec::Encoders::AudioEncoder.new(samples, sample_rate: 8000)
31
+ encoder.to_file("file.mp3")
32
+ tensor = encoder.to_tensor("mp3")
33
+ ```
34
+
35
+ Decoding
24
36
 
25
37
  ```ruby
26
38
  decoder = TorchCodec::Decoders::AudioDecoder.new("file.mp3")
27
39
  decoder.metadata
28
40
  decoder.get_all_samples
41
+ decoder.get_samples_played_in_range(start_seconds: 0, stop_seconds: 1)
42
+ ```
43
+
44
+ ### Video
45
+
46
+ Encoding
47
+
48
+ ```ruby
49
+ encoder = TorchCodec::Encoders::VideoEncoder.new(frames, frame_rate: 24)
50
+ encoder.to_file("file.mp4")
51
+ tensor = encoder.to_tensor("mp4")
52
+ ```
53
+
54
+ Decoding
55
+
56
+ ```ruby
57
+ decoder = TorchCodec::Decoders::VideoDecoder.new("file.mp4")
58
+ decoder.metadata
59
+ decoder.get_frame_at(0)
60
+ decoder.get_frames_at(Torch.tensor([0, 1, 2]))
61
+ decoder.get_frames_in_range(0, 10, step: 3)
62
+ decoder.get_frame_played_at(0)
63
+ decoder.get_frames_played_at(Torch.tensor([0, 1, 2], dtype: :float64))
64
+ decoder.get_frames_played_in_range(0, 10)
29
65
  ```
30
66
 
31
67
  ## FFmpeg Installation
@@ -906,8 +906,19 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
906
906
  void init_core(Rice::Module m) {
907
907
  m
908
908
  .define_singleton_function("create_from_file", &facebook::torchcodec::create_from_file)
909
+ .define_singleton_function("encode_audio_to_file", &facebook::torchcodec::encode_audio_to_file)
910
+ .define_singleton_function("encode_audio_to_tensor", &facebook::torchcodec::encode_audio_to_tensor)
911
+ .define_singleton_function("encode_video_to_file", &facebook::torchcodec::encode_video_to_file)
912
+ .define_singleton_function("encode_video_to_tensor", &facebook::torchcodec::encode_video_to_tensor)
913
+ .define_singleton_function("add_video_stream", &facebook::torchcodec::add_video_stream)
914
+ .define_singleton_function("add_audio_stream", &facebook::torchcodec::add_audio_stream)
909
915
  .define_singleton_function("_get_container_json_metadata", &facebook::torchcodec::get_container_json_metadata)
910
916
  .define_singleton_function("_get_stream_json_metadata", &facebook::torchcodec::get_stream_json_metadata)
911
- .define_singleton_function("add_audio_stream", &facebook::torchcodec::add_audio_stream)
912
- .define_singleton_function("get_frames_by_pts_in_range_audio", &facebook::torchcodec::get_frames_by_pts_in_range_audio);
917
+ .define_singleton_function("get_frame_at_pts", &facebook::torchcodec::get_frame_at_pts)
918
+ .define_singleton_function("get_frame_at_index", &facebook::torchcodec::get_frame_at_index)
919
+ .define_singleton_function("get_frames_at_indices", &facebook::torchcodec::get_frames_at_indices)
920
+ .define_singleton_function("get_frames_in_range", &facebook::torchcodec::get_frames_in_range)
921
+ .define_singleton_function("get_frames_by_pts_in_range", &facebook::torchcodec::get_frames_by_pts_in_range)
922
+ .define_singleton_function("get_frames_by_pts_in_range_audio", &facebook::torchcodec::get_frames_by_pts_in_range_audio)
923
+ .define_singleton_function("get_frames_by_pts", &facebook::torchcodec::get_frames_by_pts);
913
924
  }
@@ -1,5 +1,16 @@
1
1
  module TorchCodec
2
2
  module Core
3
# Builds the sample (pixel) aspect ratio of a stream as a Rational.
#
# stream_dict - Hash of stream metadata; the "sampleAspectRatioNum" and
#               "sampleAspectRatioDen" entries are read.
#
# Returns Rational(num, den), or nil when either key is absent.
# Errors raised by Rational itself (for example a zero denominator) are not
# rescued and propagate to the caller.
def self._get_optional_par_fraction(stream_dict)
  Rational(
    stream_dict.fetch("sampleAspectRatioNum"),
    stream_dict.fetch("sampleAspectRatioDen")
  )
rescue KeyError
  # A missing key means no aspect ratio is available for this stream.
  nil
end
13
+
3
14
  def self.get_container_metadata(decoder)
4
15
  container_dict = JSON.parse(_get_container_json_metadata(decoder))
5
16
  streams_metadata = []
@@ -16,7 +27,20 @@ module TorchCodec
16
27
  stream_index: stream_index
17
28
  }
18
29
  if stream_dict["mediaType"] == "video"
19
- raise Todo
30
+ streams_metadata << {
31
+ begin_stream_seconds_from_content: stream_dict["beginStreamSecondsFromContent"],
32
+ end_stream_seconds_from_content: stream_dict["endStreamSecondsFromContent"],
33
+ end_stream_seconds: stream_dict["endStreamSeconds"],
34
+ num_frames: stream_dict["numFrames"],
35
+ average_fps: stream_dict["averageFps"],
36
+ width: stream_dict["width"],
37
+ height: stream_dict["height"],
38
+ num_frames_from_header: stream_dict["numFramesFromHeader"],
39
+ num_frames_from_content: stream_dict["numFramesFromContent"],
40
+ average_fps_from_header: stream_dict["averageFpsFromHeader"],
41
+ pixel_aspect_ratio: _get_optional_par_fraction(stream_dict),
42
+ **common_meta
43
+ }
20
44
  elsif stream_dict["mediaType"] == "audio"
21
45
  streams_metadata << {
22
46
  sample_rate: stream_dict["sampleRate"],
@@ -25,7 +49,10 @@ module TorchCodec
25
49
  **common_meta
26
50
  }
27
51
  else
28
- raise Todo
52
+ # This is neither a video nor audio stream. Could be e.g. subtitles.
53
+ # We still need to add a dummy entry so that len(streams_metadata)
54
+ # is consistent with the number of streams.
55
+ streams_metadata << common_meta
29
56
  end
30
57
  end
31
58
 
@@ -7,5 +7,10 @@ module TorchCodec
7
7
  raise TypeError, "Unknown source type: #{source.class.name}"
8
8
  end
9
9
  end
10
+
11
# Returns the name of the CUDA backend variant as a String.
# The value is hard-coded: this method always returns "ffmpeg".
def self._get_cuda_backend
  # TODO improve
  "ffmpeg"
end
10
15
  end
11
16
  end
@@ -0,0 +1,208 @@
1
module TorchCodec
  module Decoders
    # Decodes frames from one video stream of a media source.
    #
    # The constructor opens the source through Decoders.create_decoder, selects
    # and validates the stream, and registers it with Core.add_video_stream.
    # Every frame-returning method yields a Hash with the keys :data,
    # :pts_seconds and :duration_seconds.
    class VideoDecoder
      ALLOWED_SEEK_MODES = %w[exact approximate].freeze
      ALLOWED_DIMENSION_ORDERS = %w[NCHW NHWC].freeze
      private_constant :ALLOWED_SEEK_MODES, :ALLOWED_DIMENSION_ORDERS

      # Metadata Hash of the selected video stream.
      attr_reader :metadata

      # source                - passed unchanged to Decoders.create_decoder.
      # stream_index          - index of the stream to decode; when nil the
      #                         container's :best_video_stream_index is used.
      # dimension_order       - "NCHW" or "NHWC".
      # num_ffmpeg_threads    - forwarded to Core.add_video_stream; must not be nil.
      # device                - nil (treated as "cpu"), a String, or a
      #                         Torch::Device (converted with #to_s).
      # seek_mode             - "exact" or "approximate".
      # transforms            - forwarded to Transforms._make_transform_specs.
      # custom_frame_mappings - not implemented yet; any non-nil value raises.
      #
      # Raises ArgumentError for invalid arguments or unusable stream metadata.
      def initialize(
        source,
        stream_index: nil,
        dimension_order: "NCHW",
        num_ffmpeg_threads: 1,
        device: nil,
        seek_mode: "exact",
        transforms: nil,
        custom_frame_mappings: nil
      )
        unless ALLOWED_SEEK_MODES.include?(seek_mode)
          raise ArgumentError, "Invalid seek mode (#{seek_mode})."
        end

        # Validate seek_mode and custom_frame_mappings are not mismatched
        if !custom_frame_mappings.nil? && seek_mode == "approximate"
          raise ArgumentError,
            "custom_frame_mappings is incompatible with seek_mode: 'approximate'. " \
            "Use seek_mode: 'custom_frame_mappings' or leave it unspecified to automatically use custom frame mappings."
        end

        # Processing of custom frame mappings is not implemented yet, so the
        # data handed to the native layer is always nil.
        custom_frame_mappings_data = nil
        raise Todo unless custom_frame_mappings.nil?

        @decoder = Decoders.create_decoder(source, seek_mode)

        @metadata,
        @stream_index,
        @begin_stream_seconds,
        @end_stream_seconds,
        @num_frames = _get_and_validate_stream_metadata(
          decoder: @decoder, stream_index: stream_index
        )

        unless ALLOWED_DIMENSION_ORDERS.include?(dimension_order)
          raise ArgumentError, "Invalid dimension order (#{dimension_order})."
        end

        if num_ffmpeg_threads.nil?
          # Name the parameter explicitly: interpolating nil alone would
          # produce the unhelpful message " should be an int.".
          raise ArgumentError, "num_ffmpeg_threads = #{num_ffmpeg_threads.inspect} should be an int."
        end

        if device.nil?
          device = "cpu" # TODO Torch.get_default_device.to_s
        elsif device.is_a?(Torch::Device)
          device = device.to_s
        end

        device_variant = Decoders._get_cuda_backend
        transform_specs = Transforms._make_transform_specs(
          transforms,
          [@metadata[:height], @metadata[:width]]
        )

        Core.add_video_stream(
          @decoder,
          num_ffmpeg_threads,
          dimension_order,
          @stream_index,
          device,
          device_variant,
          transform_specs,
          custom_frame_mappings_data
        )
      end

      # Returns the frame at the given index. :pts_seconds and
      # :duration_seconds are unwrapped with #item.
      def get_frame_at(index)
        data, pts_seconds, duration_seconds = Core.get_frame_at_index(@decoder, index)
        {
          data: data,
          pts_seconds: pts_seconds.item,
          duration_seconds: duration_seconds.item
        }
      end

      # Returns the frames at the given indices (forwarded to
      # Core.get_frames_at_indices as-is).
      def get_frames_at(indices)
        data, pts_seconds, duration_seconds = Core.get_frames_at_indices(@decoder, indices)

        {
          data: data,
          pts_seconds: pts_seconds,
          duration_seconds: duration_seconds
        }
      end

      # Returns the frames selected by start, stop and step, which are
      # forwarded to Core.get_frames_in_range.
      def get_frames_in_range(start, stop, step: 1)
        frames = Core.get_frames_in_range(@decoder, start, stop, step)
        {
          data: frames[0],
          pts_seconds: frames[1],
          duration_seconds: frames[2]
        }
      end

      # Returns the frame played at the given timestamp.
      #
      # Raises IndexError unless begin_stream_seconds <= seconds < end_stream_seconds.
      def get_frame_played_at(seconds)
        unless @begin_stream_seconds <= seconds && seconds < @end_stream_seconds
          raise IndexError, "Invalid pts in seconds: #{seconds}."
        end

        data, pts_seconds, duration_seconds = Core.get_frame_at_pts(@decoder, seconds)
        {
          data: data,
          pts_seconds: pts_seconds.item,
          duration_seconds: duration_seconds.item
        }
      end

      # Returns the frames played at the given timestamps (forwarded to
      # Core.get_frames_by_pts as-is; no range check is done here).
      def get_frames_played_at(seconds)
        data, pts_seconds, duration_seconds = Core.get_frames_by_pts(@decoder, seconds)
        {
          data: data,
          pts_seconds: pts_seconds,
          duration_seconds: duration_seconds
        }
      end

      # Returns the frames for the time range start_seconds..stop_seconds,
      # which is forwarded to Core.get_frames_by_pts_in_range.
      #
      # Raises ArgumentError when start_seconds > stop_seconds, when
      # start_seconds lies outside [begin_stream_seconds, end_stream_seconds),
      # or when stop_seconds exceeds end_stream_seconds.
      def get_frames_played_in_range(start_seconds, stop_seconds)
        unless start_seconds <= stop_seconds
          raise ArgumentError, "Invalid start seconds: #{start_seconds}. It must be less than or equal to stop seconds (#{stop_seconds})."
        end
        unless @begin_stream_seconds <= start_seconds && start_seconds < @end_stream_seconds
          raise ArgumentError, "Invalid start seconds: #{start_seconds}."
        end
        unless stop_seconds <= @end_stream_seconds
          raise ArgumentError, "Invalid stop seconds: #{stop_seconds}."
        end

        frames = Core.get_frames_by_pts_in_range(@decoder, start_seconds, stop_seconds)

        {
          data: frames[0],
          pts_seconds: frames[1],
          duration_seconds: frames[2]
        }
      end

      private

      # Resolves the stream to decode and checks that the metadata needed by
      # the range validations above is present.
      #
      # Returns [metadata, stream_index, begin_stream_seconds,
      # end_stream_seconds, num_frames].
      # Raises ArgumentError when the stream cannot be determined, is out of
      # bounds, is not a video stream, or lacks one of the required values.
      def _get_and_validate_stream_metadata(decoder:, stream_index: nil)
        container_metadata = Core.get_container_metadata(decoder)

        if stream_index.nil?
          stream_index = container_metadata[:best_video_stream_index]
          if stream_index.nil?
            raise ArgumentError, "The best video stream is unknown and there is no specified stream."
          end
        end

        if stream_index >= container_metadata[:streams].length
          raise ArgumentError, "The stream index #{stream_index} is not a valid stream."
        end

        metadata = container_metadata[:streams][stream_index]
        # Only video stream entries carry this key.
        unless metadata.key?(:begin_stream_seconds_from_content)
          raise ArgumentError, "The stream at index #{stream_index} is not a video stream."
        end

        begin_stream_seconds = metadata[:begin_stream_seconds]
        if begin_stream_seconds.nil?
          raise ArgumentError, "The minimum pts value in seconds is unknown."
        end

        end_stream_seconds = metadata[:end_stream_seconds]
        if end_stream_seconds.nil?
          raise ArgumentError, "The maximum pts value in seconds is unknown."
        end

        num_frames = metadata[:num_frames]
        if num_frames.nil?
          raise ArgumentError, "The number of frames is unknown."
        end

        [
          metadata,
          stream_index,
          begin_stream_seconds,
          end_stream_seconds,
          num_frames
        ]
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
module TorchCodec
  module Encoders
    # Encodes a tensor of audio samples to a file or to an in-memory tensor.
    class AudioEncoder
      # samples     - Torch::Tensor of dtype float32, 1D or 2D. A 1D tensor
      #               is unsqueezed at dimension 0 (treated as one channel).
      # sample_rate - sample rate of +samples+; must be > 0.
      #
      # Raises ArgumentError when any of the checks below fails.
      def initialize(samples, sample_rate:)
        # Some of these checks are also done in C++: it's OK, they're cheap, and
        # doing them here allows to surface them when the AudioEncoder is
        # instantiated, rather than later when the encoding methods are called.
        unless samples.is_a?(Torch::Tensor)
          raise ArgumentError, "Expected samples to be a Tensor, got #{samples.class.name}."
        end

        # make it 2D and assume 1 channel
        samples = Torch.unsqueeze(samples, 0) if samples.ndim == 1

        if samples.ndim != 2
          raise ArgumentError, "Expected 1D or 2D samples, got #{samples.shape}."
        end
        if samples.dtype != Torch.float32
          raise ArgumentError, "Expected float32 samples, got #{samples.dtype}."
        end
        if sample_rate <= 0
          # Include the parameter name; the bare value ("0 must be > 0.")
          # does not tell the caller which argument is wrong.
          raise ArgumentError, "sample_rate = #{sample_rate} must be > 0."
        end

        @samples = samples
        @sample_rate = sample_rate
      end

      # Encodes the samples into the file at +dest+ (converted with #to_s).
      #
      # bit_rate, num_channels and sample_rate are forwarded unchanged to
      # Core.encode_audio_to_file; nil leaves the choice to the native layer.
      def to_file(dest, bit_rate: nil, num_channels: nil, sample_rate: nil)
        Core.encode_audio_to_file(
          @samples,
          @sample_rate,
          dest.to_s,
          bit_rate,
          num_channels,
          sample_rate
        )
      end

      # Encodes the samples in the given +format+ and returns the result of
      # Core.encode_audio_to_tensor. Keyword arguments behave as in #to_file.
      def to_tensor(format, bit_rate: nil, num_channels: nil, sample_rate: nil)
        Core.encode_audio_to_tensor(
          @samples,
          @sample_rate,
          format,
          bit_rate,
          num_channels,
          sample_rate
        )
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
module TorchCodec
  module Encoders
    # Encodes a tensor of video frames to a file or to an in-memory tensor.
    class VideoEncoder
      # frames     - 4D Torch::Tensor of dtype uint8.
      # frame_rate - frame rate of +frames+; must be > 0.
      #
      # Raises ArgumentError when any of the checks below fails.
      def initialize(frames, frame_rate:)
        unless frames.is_a?(Torch::Tensor)
          raise ArgumentError, "Expected frames to be a Tensor, got #{frames.class.name}."
        end
        if frames.ndim != 4
          raise ArgumentError, "Expected 4D frames, got #{frames.shape}."
        end
        if frames.dtype != Torch.uint8
          raise ArgumentError, "Expected uint8 frames, got #{frames.dtype}."
        end
        if frame_rate <= 0
          # Include the parameter name; the bare value ("0 must be > 0.")
          # does not tell the caller which argument is wrong.
          raise ArgumentError, "frame_rate = #{frame_rate} must be > 0."
        end

        @frames = frames
        @frame_rate = frame_rate
      end

      # Encodes the frames into the file at +dest+ (converted with #to_s).
      #
      # codec, pixel_format, crf and extra_options are forwarded unchanged to
      # Core.encode_video_to_file; an Integer preset is passed as a String.
      def to_file(
        dest,
        codec: nil,
        pixel_format: nil,
        crf: nil,
        preset: nil,
        extra_options: nil
      )
        Core.encode_video_to_file(
          @frames,
          @frame_rate,
          dest.to_s,
          codec,
          pixel_format,
          crf,
          _normalize_preset(preset),
          extra_options
        )
      end

      # Encodes the frames in the given +format+ and returns the result of
      # Core.encode_video_to_tensor. Keyword arguments behave as in #to_file.
      def to_tensor(
        format,
        codec: nil,
        pixel_format: nil,
        crf: nil,
        preset: nil,
        extra_options: nil
      )
        Core.encode_video_to_tensor(
          @frames,
          @frame_rate,
          format,
          codec,
          pixel_format,
          crf,
          _normalize_preset(preset),
          extra_options
        )
      end

      private

      # Converts an Integer preset to its String form; any other value
      # (including nil) is returned untouched.
      def _normalize_preset(preset)
        preset.is_a?(Integer) ? preset.to_s : preset
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module TorchCodec
  module Transforms
    # Produces the transform specification string for the decoder.
    #
    # transforms - the requested transforms, or nil for none.
    # input_dims - dimensions of the input frames (currently unused).
    #
    # Returns an empty String when no transforms are requested. Any non-nil
    # +transforms+ is not supported yet and raises Todo.
    def self._make_transform_specs(transforms, input_dims)
      raise Todo unless transforms.nil?

      ""
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module TorchCodec
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/torchcodec.rb CHANGED
@@ -16,6 +16,14 @@ require_relative "torchcodec/core/metadata"
16
16
  # decoders
17
17
  require_relative "torchcodec/decoders/audio_decoder"
18
18
  require_relative "torchcodec/decoders/decoder_utils"
19
+ require_relative "torchcodec/decoders/video_decoder"
20
+
21
+ # encoders
22
+ require_relative "torchcodec/encoders/audio_encoder"
23
+ require_relative "torchcodec/encoders/video_encoder"
24
+
25
+ # transforms
26
+ require_relative "torchcodec/transforms/decoder_transforms"
19
27
 
20
28
  module TorchCodec
21
29
  class Error < StandardError; end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: torchcodec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -65,6 +65,10 @@ files:
65
65
  - lib/torchcodec/core/metadata.rb
66
66
  - lib/torchcodec/decoders/audio_decoder.rb
67
67
  - lib/torchcodec/decoders/decoder_utils.rb
68
+ - lib/torchcodec/decoders/video_decoder.rb
69
+ - lib/torchcodec/encoders/audio_encoder.rb
70
+ - lib/torchcodec/encoders/video_encoder.rb
71
+ - lib/torchcodec/transforms/decoder_transforms.rb
68
72
  - lib/torchcodec/version.rb
69
73
  homepage: https://github.com/ankane/torchcodec-ruby
70
74
  licenses: