yta-video-opengl 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,17 +15,33 @@ frame we are requesting in the moment, keeping in
15
15
  memory all those frames to be handled fast. It
16
16
  will remove the old frames if needed to use only
17
17
  the 'size' we set when creating it.
18
+
19
+ A stream can have 'fps = 60' but use a
20
+ different time base that makes the pts values go 0,
21
+ 256, 512... for example. The 'time_base' is the
22
+ only accurate way to obtain the pts.
23
+
24
+ Feel free to move this explanation to another
25
+ place, it's about the duration.
26
+
27
+ The stream 'duration' parameter is measured
28
+ in ticks: the number of ticks that the
29
+ stream lasts. Here below is an example:
30
+
31
+ - Duration raw: 529200
32
+ - Time base: 1/44100
33
+ - Duration (seconds): 12.0
18
34
  """
19
- from yta_video_opengl.utils import t_to_pts, pts_to_t, pts_to_index, index_to_pts
20
- from yta_video_frame_time import T
35
+ from yta_video_opengl.t import T
21
36
  from av.container import InputContainer
22
37
  from av.video.stream import VideoStream
23
38
  from av.audio.stream import AudioStream
24
39
  from av.video.frame import VideoFrame
25
40
  from av.audio.frame import AudioFrame
41
+ from av.packet import Packet
26
42
  from yta_validation.parameter import ParameterValidator
27
43
  from yta_validation import PythonValidator
28
- from fractions import Fraction
44
+ from quicktions import Fraction
29
45
  from collections import OrderedDict
30
46
  from typing import Union
31
47
 
@@ -33,6 +49,10 @@ import numpy as np
33
49
  import math
34
50
 
35
51
 
52
+ # TODO: This is not actually a Video
53
+ # cache, it is a FrameCache because we
54
+ # create one for video but another
55
+ # one for audio. Rename it please.
36
56
  class VideoFrameCache:
37
57
  """
38
58
  Class to manage the frames cache of a video
@@ -42,14 +62,14 @@ class VideoFrameCache:
42
62
  @property
43
63
  def fps(
44
64
  self
45
- ) -> float:
65
+ ) -> Union[int, Fraction, None]:
46
66
  """
47
- The frames per second as a float.
67
+ The frames per second.
48
68
  """
49
69
  return (
50
- float(self.stream.average_rate)
70
+ self.stream.average_rate
51
71
  if self.stream.type == 'video' else
52
- float(self.stream.rate)
72
+ self.stream.rate
53
73
  )
54
74
 
55
75
  @property
@@ -94,6 +114,31 @@ class VideoFrameCache:
94
114
  end.
95
115
  """
96
116
 
117
+ # TODO: This is new, remove this comment if
118
+ # it is ok
119
+ # TODO: This way of obtaining the duration
120
+ # in ticks must be a utils
121
+ self.frame_duration: int = (
122
+ self.stream.duration / self.stream.frames
123
+ if PythonValidator.is_instance_of(stream, VideoStream) else
124
+ # TODO: Is this below ok (?)
125
+ self.stream.frames
126
+ )
127
+ """
128
+ The duration (in ticks) of the frame, that
129
+ is the step between the different pts.
130
+ """
131
+ self._last_packet_accessed: Union[Packet, None] = None
132
+ """
133
+ The last packet that has been accessed
134
+ """
135
+ self._last_frame_read: Union[VideoFrame, AudioFrame, None] = None
136
+ """
137
+ The last frame we have read when decoding.
138
+ Useful to avoid seeking all the time when we
139
+ don't need it.
140
+ """
141
+
97
142
  self._prepare()
98
143
 
99
144
  def _prepare(
@@ -108,6 +153,7 @@ class VideoFrameCache:
108
153
  # use the amount of frames of the biggest
109
154
  # interval of frames that belongs to a key
110
155
  # frame, or a value by default
156
+ # TODO: Careful if this is too big
111
157
  fps = (
112
158
  float(self.stream.average_rate)
113
159
  if PythonValidator.is_instance_of(self.stream, VideoStream) else
@@ -116,7 +162,7 @@ class VideoFrameCache:
116
162
  # Intervals, but in number of frames
117
163
  intervals = np.diff(
118
164
  # Intervals of time between keyframes
119
- np.array(self.key_frames_pts) * self.stream.time_base
165
+ np.array(self.key_frames_pts) * self.time_base
120
166
  ) * fps
121
167
 
122
168
  self.size = (
@@ -131,7 +177,7 @@ class VideoFrameCache:
131
177
 
132
178
  self.container.seek(0)
133
179
 
134
- def _get_nearest_keyframe_fps(
180
+ def _get_nearest_keyframe_pts(
135
181
  self,
136
182
  pts: int
137
183
  ):
@@ -157,7 +203,6 @@ class VideoFrameCache:
157
203
  the cache if full.
158
204
  """
159
205
  if frame.pts not in self.cache:
160
- # TODO: The 'format' must be dynamic
161
206
  self.cache[frame.pts] = frame
162
207
 
163
208
  # Clean cache if full
@@ -165,145 +210,161 @@ class VideoFrameCache:
165
210
  self.cache.popitem(last = False)
166
211
 
167
212
  return frame
168
-
169
- def get_frame_from_pts(
213
+
214
+ def _seek(
170
215
  self,
171
216
  pts: int
172
- ) -> Union[VideoFrame, AudioFrame, None]:
217
+ ):
173
218
  """
174
- Get the frame that has the provided 'pts'.
219
+ Seek to the given 'pts' only if it is not
220
+ the next 'pts' to the last read, and it
221
+ will also apply a pad to avoid problems
222
+ when reading audio frames.
175
223
 
176
- This method will start decoding frames from the
177
- most near key frame (the one with the nearer
178
- pts) until the one requested is found. All those
179
- frames will be stored in cache.
224
+ TODO: Apply the padding only to audio
225
+ frame reading (?)
226
+ """
227
+ # I found that it is recommended to
228
+ # read ~100ms before the pts we want to
229
+ # actually read so we obtain the frames
230
+ # clean (this is important in audio)
231
+ # TODO: This is maybe too much for a
232
+ # video and not needed
233
+ pts_pad = int(0.1 / self.time_base)
234
+ self.container.seek(
235
+ offset = max(0, pts - pts_pad),
236
+ stream = self.stream
237
+ )
180
238
 
181
- This method must be called when the frame
182
- requested is not stored in the caché.
239
+ def get_video_frame(
240
+ self,
241
+ t: Union[int, float, Fraction]
242
+ ) -> VideoFrame:
183
243
  """
184
- if pts in self.cache:
185
- return self.cache[pts]
186
-
187
- # Look for the most near key frame
188
- key_frame_pts = self._get_nearest_keyframe_fps(pts)
244
+ Get the video frame that is in the 't'
245
+ time moment provided.
246
+ """
247
+ for frame in self.get_video_frames(t):
248
+ return frame
189
249
 
190
- # Go to the key frame that includes it
191
- self.container.seek(key_frame_pts, stream = self.stream)
250
+ def get_video_frames(
251
+ self,
252
+ start: Union[int, float, Fraction] = 0,
253
+ end: Union[int, float, Fraction, None] = None
254
+ ):
255
+ """
256
+ Get all the frames in the range between
257
+ the provided 'start' and 'end' time in
258
+ seconds.
259
+
260
+ This method is an iterator that yields
261
+ each frame in that range.
262
+ """
263
+ start = T(start, self.time_base).truncated
264
+ end = (
265
+ T(end, self.time_base).truncated
266
+ if end is not None else
267
+ # The next frame
268
+ start + (1 / self.fps)
269
+ )
270
+
271
+ key_frame_pts = self._get_nearest_keyframe_pts(start / self.time_base)
192
272
 
193
- decoded = None
194
- for frame in self.container.decode(self.stream):
195
- # TODO: Could 'frame' be None (?)
196
- if frame.pts is None:
273
+ if (
274
+ self._last_packet_accessed is None or
275
+ self._last_packet_accessed.pts != key_frame_pts
276
+ ):
277
+ self._seek(key_frame_pts)
278
+
279
+ for packet in self.container.demux(self.stream):
280
+ if packet.pts is None:
197
281
  continue
198
282
 
199
- # Store in cache if needed
200
- self._store_frame_in_cache(frame)
283
+ self._last_packet_accessed = packet
201
284
 
202
- if frame.pts >= pts:
203
- decoded = self.cache[frame.pts]
204
- break
285
+ for frame in packet.decode():
286
+ if frame.pts is None:
287
+ continue
205
288
 
206
- # TODO: Is this working? We need previous
207
- # frames to be able to decode...
208
- return decoded
289
+ # We store all the frames in cache
290
+ self._store_frame_in_cache(frame)
291
+
292
+ current_frame_time = frame.pts * self.time_base
293
+
294
+ # We want the range [start, end)
295
+ if start <= current_frame_time < end:
296
+ yield frame
209
297
 
210
- def get_frame(
298
+ if current_frame_time >= end:
299
+ break
300
+
301
+ def get_audio_frame_from_t(
211
302
  self,
212
- index: int
213
- ) -> Union[VideoFrame, AudioFrame]:
303
+ t: Union[int, float, Fraction]
304
+ ):
214
305
  """
215
- Get the frame with the given 'index' from
216
- the cache.
306
+ Get the single audio frame that must be
307
+ played at the 't' time moment provided.
308
+ This method is useful to get the single
309
+ audio frame that we need to combine
310
+ when using it in a composition.
311
+
312
+ TODO: Are we actually using this method (?)
217
313
  """
218
- # TODO: Maybe we can accept 'pts' also
219
- pts = index_to_pts(index, self.time_base, self.fps)
314
+ t: T = T(t, self.time_base)
315
+ # We need just the one audio frame
316
+ for frame in self.get_audio_frames(t.truncated, t.next(1).truncated):
317
+ return frame
220
318
 
221
- return (
222
- self.cache[pts]
223
- if pts in self.cache else
224
- self.get_frame_from_pts(pts)
225
- )
226
-
227
- def get_frame_from_t(
319
+ def get_audio_frames_from_t(
228
320
  self,
229
- t: float
230
- ) -> Union[VideoFrame, AudioFrame]:
321
+ t: Union[int, float, Fraction]
322
+ ):
231
323
  """
232
- Get the frame with the given 't' time moment
233
- from the cache.
324
+ Get all the audio frames that must be
325
+ played at the 't' time moment provided.
234
326
  """
235
- pts = t_to_pts(t, self.time_base)
236
-
237
- return (
238
- self.cache[pts]
239
- if pts in self.cache else
240
- self.get_frame_from_pts(pts)
241
- )
327
+ for frame in self.get_audio_frames(t):
328
+ yield frame
242
329
 
243
- def get_frames(
330
+ def get_audio_frames(
244
331
  self,
245
- start: float = 0,
246
- end: Union[float, None] = None
332
+ start: Union[int, float, Fraction] = 0,
333
+ end: Union[int, float, Fraction, None] = None
247
334
  ):
248
335
  """
249
- Get all the frames in the range between
250
- the provided 'start' and 'end' time in
251
- seconds.
336
+ Get all the audio frames in the range
337
+ between the provided 'start' and 'end'
338
+ time (in seconds).
252
339
 
253
340
  This method is an iterator that yields
254
341
  the frame, its t and its index.
255
342
  """
256
- # We use the cache as iterator if all the frames
257
- # requested are stored there
258
- # TODO: I think this is not ok... I will never
259
- # have all the pts form here stored, as they come
260
- # from 't' that is different...
261
-
262
- """
263
- Feel free to move this explanation to other
264
- place, its about the duration.
265
-
266
- The stream 'duration' parameter is measured
267
- on ticks, the amount of ticks that the
268
- stream lasts. Here below is an example:
269
-
270
- - Duration raw: 529200
271
- - Time base: 1/44100
272
- - Duration (seconds): 12.0
273
- """
274
-
275
- # The 'duration' is on pts ticks
276
- duration = float(self.stream.duration * self.stream.time_base)
277
- print(f'duration of the whole stream: {str(duration)}s, asking for [{str(start)}, {str(end)})')
278
- # TODO: I think it would be better to
279
- # receive and work with pts instead of
280
- # 't' time moments...
281
- # pts_list = [
282
- # t_to_pts(t, self.time_base)
283
- # for t in T.get_frame_indexes(duration, self.fps, start, end)
284
- # ]
285
-
286
- # if all(
287
- # pts in self.cache
288
- # for pts in pts_list
289
- # ):
290
- # for pts in pts_list:
291
- # yield self.cache[pts]
292
-
293
- # If not all, we ignore the cache because we
294
- # need to decode and they are all consecutive
295
- start = t_to_pts(start, self.time_base)
343
+ # TODO: Is this ok? We are trying to obtain
344
+ # the audio frames for a video frame, so
345
+ # should we use the 'self.time_base' to
346
+ # truncate (?)
347
+ start = T(start, self.time_base).truncated
296
348
  end = (
297
- t_to_pts(end, self.time_base)
349
+ T(end, self.time_base).truncated
298
350
  if end is not None else
299
- None
351
+ start + (1 / self.fps)
300
352
  )
301
- key_frame_pts = self._get_nearest_keyframe_fps(start)
302
353
 
303
- # Go to the nearest key frame to start decoding
304
- self.container.seek(key_frame_pts, stream = self.stream)
354
+ key_frame_pts = self._get_nearest_keyframe_pts(start / self.time_base)
355
+
356
+ if (
357
+ self._last_packet_accessed is None or
358
+ self._last_packet_accessed.pts != key_frame_pts
359
+ ):
360
+ self._seek(key_frame_pts)
305
361
 
306
362
  for packet in self.container.demux(self.stream):
363
+ if packet.pts is None:
364
+ continue
365
+
366
+ self._last_packet_accessed = packet
367
+
307
368
  for frame in packet.decode():
308
369
  if frame.pts is None:
309
370
  continue
@@ -311,29 +372,24 @@ class VideoFrameCache:
311
372
  # We store all the frames in cache
312
373
  self._store_frame_in_cache(frame)
313
374
 
314
- print(frame)
315
- frame_end_pts = frame.pts + int(frame.samples * (1 / self.stream.sample_rate) / self.time_base)
316
- #frame_end_pts = frame.pts + int(frame.samples)
317
- #frame_end_pts = frame.pts + int(frame.samples / (self.stream.sample_rate * self.time_base))
318
- print(f' Frame from [{str(frame.pts)}, {str(frame_end_pts)}] and looking for [{str(start)}, {str(end)}]')
375
+ current_frame_time = frame.pts * self.time_base
376
+ # End is not included, it's the start of the
377
+ # next frame actually
378
+ frame_end = current_frame_time + (frame.samples / self.stream.sample_rate)
319
379
 
320
380
  # For the next comments imagine we are looking
321
381
  # for the [1.0, 2.0) audio time range
322
382
  # Previous frame and nothing is inside
323
- if frame_end_pts <= start:
383
+ if frame_end <= start:
324
384
  # From 0.25 to 1.0
325
385
  continue
326
-
386
+
327
387
  # We finished, nothing is inside and its after
328
- if (
329
- end is not None and
330
- frame.pts >= end
331
- ):
388
+ if current_frame_time >= end:
332
389
  # From 2.0 to 2.75
333
390
  return
334
391
 
335
- # We need: from 1 to 2
336
- # Audio is:
392
+ # If we need audio from 1 to 2, audio is:
337
393
  # - from 0 to 0.75 (Not included, omit)
338
394
  # - from 0.5 to 1.5 (Included, take 1.0 to 1.5)
339
395
  # - from 0.5 to 2.5 (Included, take 1.0 to 2.0)
@@ -343,55 +399,46 @@ class VideoFrameCache:
343
399
 
344
400
  # Here below, at least a part is inside
345
401
  if (
346
- frame.pts < start and
347
- frame_end_pts > start
402
+ current_frame_time < start and
403
+ frame_end > start
348
404
  ):
349
405
  # A part at the end is included
350
406
  end_time = (
351
407
  # From 0.5 to 1.5 => take 1.0 to 1.5
352
- frame_end_pts
353
- if frame_end_pts <= end else
408
+ frame_end
409
+ if frame_end <= end else
354
410
  # From 0.5 to 2.5 => take 1.0 to 2.0
355
411
  end
356
412
  )
357
- print('A part at the end is included.')
358
- # TODO: I'm using too much 'pts_to_t'
359
- frame = trim_audio_frame_pts(
413
+ #print('A part at the end is included.')
414
+ frame = trim_audio_frame(
360
415
  frame = frame,
361
- start_pts = start,
362
- end_pts = end_time,
416
+ start = start,
417
+ end = end_time,
363
418
  time_base = self.time_base
364
419
  )
365
420
  elif (
366
- frame.pts >= start and
367
- frame.pts < end
421
+ current_frame_time >= start and
422
+ current_frame_time < end
368
423
  ):
369
424
  end_time = (
370
425
  # From 1.25 to 1.5 => take 1.25 to 1.5
371
- frame_end_pts
372
- if frame_end_pts <= end else
426
+ frame_end
427
+ if frame_end <= end else
373
428
  # From 1.25 to 2.5 => take 1.25 to 2.0
374
429
  end
375
430
  )
376
431
  # A part at the beginning is included
377
- print('A part at the begining is included.')
378
- # TODO: I'm using too much 'pts_to_t'
379
- frame = trim_audio_frame_pts(
432
+ #print('A part at the begining is included.')
433
+ frame = trim_audio_frame(
380
434
  frame = frame,
381
- start_pts = frame.pts,
382
- end_pts = end_time,
435
+ start = current_frame_time,
436
+ end = end_time,
383
437
  time_base = self.time_base
384
438
  )
385
439
 
386
440
  # If the whole frame is in, pass it as it is
387
-
388
- # TODO: Maybe send a @dataclass instead (?)
389
- # TODO: Do I really need these 't' and 'index' (?)
390
- yield (
391
- frame,
392
- pts_to_t(frame.pts, self.time_base),
393
- pts_to_index(frame.pts, self.time_base, self.fps)
394
- )
441
+ yield frame
395
442
 
396
443
  def clear(
397
444
  self
@@ -402,106 +449,64 @@ class VideoFrameCache:
402
449
  self.cache.clear()
403
450
 
404
451
  return self
405
-
406
-
407
452
 
408
- import av
409
- import numpy as np
410
-
411
- import av
412
-
413
-
414
-
415
- def trim_audio_frame_pts(
416
- frame: av.AudioFrame,
417
- start_pts: int,
418
- end_pts: int,
419
- time_base
420
- ) -> av.AudioFrame:
453
+ def trim_audio_frame(
454
+ frame: AudioFrame,
455
+ start: Union[int, float, Fraction],
456
+ end: Union[int, float, Fraction],
457
+ time_base: Fraction
458
+ ) -> AudioFrame:
421
459
  """
422
- Recorta un AudioFrame para quedarse solo con la parte entre [start_pts, end_pts] en ticks (PTS).
460
+ Trim an audio frame to obtain the part between
461
+ [start, end), that is provided in seconds.
423
462
  """
424
- samples = frame.to_ndarray() # (channels, n_samples)
425
- n_channels, n_samples = samples.shape
426
- sr = frame.sample_rate
427
-
428
- #frame_end_pts = frame.pts + int((n_samples / sr) / time_base)
429
- # TODO: This could be wrong
430
- frame_end_pts = frame.pts + int(frame.samples)
431
-
432
- # solapamiento en PTS
433
- cut_start_pts = max(frame.pts, start_pts)
434
- cut_end_pts = min(frame_end_pts, end_pts)
435
-
436
- if cut_start_pts >= cut_end_pts:
437
- raise Exception('Oops...')
438
- return None # no hay solapamiento
439
-
440
- # convertir a índices de samples (en ticks → segundos → samples)
441
- cut_start_time = (cut_start_pts - frame.pts) * time_base
442
- cut_end_time = (cut_end_pts - frame.pts) * time_base
443
-
444
- start_idx = int(cut_start_time * sr)
445
- end_idx = int(cut_end_time * sr)
446
-
447
- print(
448
- f"cutting [{frame.pts}, {frame_end_pts}] "
449
- f"to [{cut_start_pts}, {cut_end_pts}] "
450
- f"({start_idx}:{end_idx} / {frame.samples})"
451
- #f"({start_idx}:{end_idx} / {n_samples})"
452
- )
453
-
454
- cut_samples = samples[:, start_idx:end_idx]
455
-
456
- # crear nuevo AudioFrame
457
- new_frame = av.AudioFrame.from_ndarray(cut_samples, format=frame.format, layout=frame.layout)
458
- new_frame.sample_rate = sr
459
-
460
- # ajustar PTS → corresponde al inicio real del recorte
461
- new_frame.pts = cut_start_pts
462
- new_frame.time_base = time_base
463
-
464
- return new_frame
463
+ # (channels, n_samples)
464
+ samples = frame.to_ndarray()
465
+ n_samples = samples.shape[1]
465
466
 
467
+ # In seconds
468
+ frame_start = frame.pts * float(time_base)
469
+ frame_end = frame_start + (n_samples / frame.sample_rate)
466
470
 
467
-
468
- def trim_audio_frame_t(
469
- frame: av.AudioFrame,
470
- start_time: float,
471
- end_time: float,
472
- time_base
473
- ) -> av.AudioFrame:
474
- """
475
- Recorta un AudioFrame para quedarse solo con la parte entre [start_time, end_time] en segundos.
476
- """
477
- samples = frame.to_ndarray() # (channels, n_samples)
478
- n_channels, n_samples = samples.shape
479
- sr = frame.sample_rate
480
-
481
- frame_start = float(frame.pts * time_base)
482
- frame_end = frame_start + (n_samples / sr)
483
-
484
- # calcular solapamiento en segundos
485
- cut_start = max(frame_start, start_time)
486
- cut_end = min(frame_end, end_time)
471
+ # Overlapping
472
+ cut_start = max(frame_start, float(start))
473
+ cut_end = min(frame_end, float(end))
487
474
 
488
475
  if cut_start >= cut_end:
489
- return None # no hay solapamiento
476
+ # No overlapping
477
+ return None
478
+
479
+ # To sample indexes
480
+ start_index = int(round((cut_start - frame_start) * frame.sample_rate))
481
+ end_index = int(round((cut_end - frame_start) * frame.sample_rate))
482
+
483
+ new_frame = AudioFrame.from_ndarray(
484
+ # end_index is not included: so [start, end)
485
+ array = samples[:, start_index:end_index],
486
+ format = frame.format,
487
+ layout = frame.layout
488
+ )
490
489
 
491
- # convertir a índices de samples
492
- start_idx = int((cut_start - frame_start) * sr)
493
- end_idx = int((cut_end - frame_start) * sr)
490
+ # Set attributes
491
+ new_frame.sample_rate = frame.sample_rate
492
+ new_frame.time_base = time_base
493
+ new_frame.pts = int(round(cut_start / float(time_base)))
494
494
 
495
- print(f'cutting [{str(frame_start)}, {str(frame_end)}] to [{str(float(start_time))}, {str(float(end_time))}] from {str(start_idx)} to {str(end_idx)} of {str(int((frame_end - frame_start) * sr))}')
496
- cut_samples = samples[:, start_idx:end_idx]
495
+ return new_frame
497
496
 
498
- # crear nuevo AudioFrame
499
- new_frame = av.AudioFrame.from_ndarray(cut_samples, format = frame.format, layout = frame.layout)
500
- new_frame.sample_rate = sr
501
497
 
502
- # ajustar PTS → corresponde al inicio real del recorte
503
- new_pts = int(cut_start / time_base)
504
- new_frame.pts = new_pts
505
- new_frame.time_base = time_base
506
498
 
507
- return new_frame
499
+ """
500
+ There is a way of editing videos with
501
+ arbitrary access to frames: transforming
502
+ the source videos into intra-frame
503
+ videos. This is an ffmpeg command that
504
+ can do it:
505
+
506
+ - `ffmpeg -i input.mp4 -c:v libx264 -x264opts keyint=1 -preset fast -crf 18 -c:a copy output_intra.mp4`
507
+
508
+ Once you have the 'output_intra.mp4',
509
+ each packet can be decoded into its frame
510
+ without depending on the previous one, so
511
+ seeking and jumping is easy.
512
+ """