hyper-animator-codex 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,133 @@
1
+ """节拍检测模块"""
2
+ from dataclasses import dataclass
3
+ from typing import List, Callable, Optional
4
+ from enum import Enum
5
+ import numpy as np
6
+ import librosa
7
+
8
+ from .utils import ms_to_frame
9
+
10
+
11
class EnergyLevel(Enum):
    """Energy tier assigned to a beat (weak / medium / strong)."""
    # Tiers are assigned by _classify_energy_levels from percentile thresholds.
    WEAK = "weak"
    MEDIUM = "medium"
    STRONG = "strong"
16
+
17
+
18
@dataclass
class Beat:
    """A single detected beat."""
    # Beat time in milliseconds (detect_beats truncates seconds * 1000 to int).
    time_ms: int
    # Frame number obtained from time_ms via ms_to_frame at the requested fps.
    frame: int
    # 1-based position of the beat inside its bar.
    beat_in_bar: int
    # Energy tier; MEDIUM unless the caller supplies one.
    energy_level: EnergyLevel = EnergyLevel.MEDIUM
25
+
26
+
27
@dataclass
class BeatResult:
    """Result of beat detection."""
    # Estimated tempo (first element of the tempo value returned by librosa).
    bpm: float
    # One Beat per detected beat, in detection order.
    beats: List[Beat]
    # detect_beats never overrides this default; it assumes 4/4 throughout.
    time_signature: str = "4/4"
33
+
34
+
35
def detect_beats(
    y: np.ndarray,
    sr: int,
    fps: int = 30,
    on_progress: Optional[Callable[[float], None]] = None
) -> BeatResult:
    """Detect beats in an audio signal and grade each beat by local energy.

    Args:
        y: Audio samples, passed straight to ``librosa.beat.beat_track``.
        sr: Sample rate of ``y``.
        fps: Frame rate used to convert each beat time to a frame number.
        on_progress: Optional callback; it is invoked with 10, 40, 70, 80
            and 100 as the successive stages complete.

    Returns:
        A BeatResult with the estimated tempo and one Beat per detected
        beat. ``time_signature`` keeps its "4/4" default.
    """
    def report(percent: float) -> None:
        # Progress reporting is optional; skip silently when no callback is set.
        if on_progress:
            on_progress(percent)

    report(10.0)

    # Beat tracking with librosa.
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # tempo may come back as a scalar or an array; take its first element.
    bpm = float(np.asarray(tempo).reshape(-1)[0])
    report(40.0)

    # Beat positions in milliseconds (fraction truncated by int()).
    beat_times_ms = [
        int(seconds * 1000)
        for seconds in librosa.frames_to_time(beat_frames, sr=sr)
    ]

    # Local energy around every beat.
    beat_energies = _compute_beat_energies(y, sr, beat_frames)
    report(70.0)

    # Bucket the energies into weak / medium / strong.
    energy_levels = _classify_energy_levels(beat_energies)
    report(80.0)

    # Bars are assumed to be 4/4: positions cycle 1, 2, 3, 4, 1, ...
    beats_per_bar = 4
    beats = [
        Beat(
            time_ms=time_ms,
            frame=ms_to_frame(time_ms, fps),
            beat_in_bar=(index % beats_per_bar) + 1,
            energy_level=energy_levels[index],
        )
        for index, time_ms in enumerate(beat_times_ms)
    ]
    report(100.0)

    return BeatResult(bpm=bpm, beats=beats)
86
+
87
+
88
+ def _compute_beat_energies(
89
+ y: np.ndarray,
90
+ sr: int,
91
+ beat_frames: np.ndarray,
92
+ window_ms: int = 50
93
+ ) -> np.ndarray:
94
+ """计算每个beat时刻的局部能量"""
95
+ window_samples = int(sr * window_ms / 1000)
96
+ energies = []
97
+
98
+ for frame in beat_frames:
99
+ # 将librosa帧转换为样本索引
100
+ sample = librosa.frames_to_samples(frame)
101
+ start = max(0, sample - window_samples // 2)
102
+ end = min(len(y), sample + window_samples // 2)
103
+
104
+ if end > start:
105
+ # 使用RMS作为能量度量
106
+ segment = y[start:end]
107
+ rms = np.sqrt(np.mean(segment ** 2))
108
+ energies.append(rms)
109
+ else:
110
+ energies.append(0.0)
111
+
112
+ return np.array(energies)
113
+
114
+
115
def _classify_energy_levels(energies: np.ndarray) -> List[EnergyLevel]:
    """Bucket beat energies into WEAK / MEDIUM / STRONG tiers.

    The 33rd and 66th percentiles of ``energies`` are the cut points:
    values at or below the lower one are WEAK, values at or above the upper
    one are STRONG, everything else is MEDIUM. The WEAK test runs first, so
    when both cut points coincide (for example all energies equal) every
    beat is classified WEAK.

    Returns:
        One EnergyLevel per input energy; an empty list for empty input.
    """
    if len(energies) == 0:
        return []

    # Percentile-based thresholds.
    low_cut = np.percentile(energies, 33)
    high_cut = np.percentile(energies, 66)

    def tier(value):
        if value <= low_cut:
            return EnergyLevel.WEAK
        if value >= high_cut:
            return EnergyLevel.STRONG
        return EnergyLevel.MEDIUM

    return [tier(value) for value in energies]
@@ -0,0 +1,74 @@
1
+ """命令行接口"""
2
+ import json
3
+ import sys
4
+ import click
5
+
6
+ from .analyzer import analyze
7
+ from .errors import (
8
+ BeatDetectorError,
9
+ FileNotFoundError,
10
+ UnsupportedFormatError,
11
+ AnalysisError,
12
+ FFmpegRequiredError
13
+ )
14
+
15
+
16
@click.command()
@click.argument('input_file', type=click.Path(exists=False))
@click.option('-o', '--output', type=click.Path(), help='Output file path')
@click.option('--fps', default=30, help='Frame rate (default: 30)')
@click.option('--log-level', default='info',
              type=click.Choice(['debug', 'info', 'warning', 'error']),
              help='Log level (default: info)')
@click.option('--pretty', is_flag=True, help='Pretty print JSON')
def main(input_file: str, output: str, fps: int, log_level: str, pretty: bool) -> None:
    """
    Music beat detector CLI

    Analyze audio file and output beat detection results in JSON format.

    Supported formats (no extra dependencies):
    .wav, .flac, .ogg

    Formats requiring ffmpeg:
    .mp3, .m4a, .aac, .wma

    Usage:
    beat-detector input.mp3 -o output.json
    beat-detector input.wav --pretty
    """
    # The docstring above is the command's help text, so it is kept verbatim.
    #
    # Exit codes: 0 success, 1 file not found, 2 unsupported format,
    # 3 analysis failure or any unexpected error, 4 ffmpeg missing.
    # The table is ordered; the first matching type decides the exit code.
    known_failures = (
        (FileNotFoundError, 1),
        (UnsupportedFormatError, 2),
        (FFmpegRequiredError, 4),
        (AnalysisError, 3),
    )

    try:
        result = analyze(input_file, fps=fps, log_level=log_level)
        json_output = result.to_json(pretty=pretty)

        if not output:
            # No output path: print the JSON to stdout.
            click.echo(json_output)
        else:
            result.save(output, pretty=pretty)
            click.echo(f"Output saved to: {output}")
    except tuple(exc_type for exc_type, _ in known_failures) as e:
        click.echo(f"Error: {e.message}", err=True)
        exit_code = next(
            code for exc_type, code in known_failures if isinstance(e, exc_type)
        )
        sys.exit(exit_code)
    except Exception as e:
        # Anything else is reported as unexpected and shares exit code 3.
        click.echo(f"Unexpected error: {str(e)}", err=True)
        sys.exit(3)
    else:
        sys.exit(0)
71
+
72
+
73
# Run the click command when this module is executed as a script.
if __name__ == '__main__':
    main()
@@ -0,0 +1,49 @@
1
+ """错误类型"""
2
+ from typing import Optional
3
+
4
+
5
class BeatDetectorError(Exception):
    """Base class of the package's exceptions.

    Attributes:
        message: Human-readable description; also passed to ``Exception``
            so ``str(error)`` returns it.
        code: Integer code associated with the error (defaults to 1).
    """

    def __init__(self, message: str, code: int = 1):
        super().__init__(message)
        self.code = code
        self.message = message
12
+
13
+
14
class FileNotFoundError(BeatDetectorError):
    """Raised when the input file does not exist (code 1).

    NOTE: the class name is the same as the builtin ``FileNotFoundError``,
    so it hides the builtin in any module that imports it by name.
    """

    def __init__(self, path: str):
        message = f"File not found: {path}"
        super().__init__(message, code=1)
19
+
20
+
21
class UnsupportedFormatError(BeatDetectorError):
    """Raised for an audio format that is not supported (code 2)."""

    def __init__(self, format: str):
        # `format` is kept as the parameter name for keyword callers.
        message = f"Unsupported audio format: {format}"
        super().__init__(message, code=2)
26
+
27
+
28
class FFmpegRequiredError(BeatDetectorError):
    """Raised when a format needs ffmpeg (code 4).

    The message names the format and lists install commands per platform.
    """

    def __init__(self, format: str):
        # `format` is kept as the parameter name for keyword callers.
        lines = [
            f"Format '{format}' requires ffmpeg. Please install ffmpeg:",
            " Ubuntu/Debian: sudo apt install ffmpeg",
            " macOS: brew install ffmpeg",
            " Windows: choco install ffmpeg",
            "",
            "Or use native formats: .wav, .flac, .ogg",
        ]
        super().__init__("\n".join(lines), code=4)
42
+
43
+
44
class AnalysisError(BeatDetectorError):
    """Raised when analysis fails (code 3).

    Attributes:
        cause: The underlying exception, if the caller supplied one. It is
            only stored; it is not attached as ``__cause__``.
    """

    def __init__(self, message: str, cause: Optional[Exception] = None):
        super().__init__(message, code=3)
        self.cause = cause
@@ -0,0 +1,171 @@
1
+ """结构检测模块"""
2
+ from dataclasses import dataclass
3
+ from typing import List, Callable, Optional
4
+ import numpy as np
5
+ import librosa
6
+
7
+ from .utils import ms_to_frame
8
+
9
+
10
@dataclass
class Segment:
    """A section of the track."""
    # Section label; _detect_segments emits 'intro', 'verse' and 'outro'.
    type: str
    # Section boundaries in milliseconds.
    start_ms: int
    end_ms: int
    # The same boundaries converted with ms_to_frame at the requested fps.
    start_frame: int
    end_frame: int
    # Section's mean RMS relative to the loudest section's mean RMS.
    confidence: float
19
+
20
+
21
@dataclass
class EnergyPeak:
    """A local energy maximum."""
    # Peak time in milliseconds.
    time_ms: int
    # Peak time converted with ms_to_frame at the requested fps.
    frame: int
    # RMS at the peak divided by the maximum RMS of the whole signal.
    intensity: float
27
+
28
+
29
@dataclass
class SilenceRegion:
    """A stretch of silence."""
    # Region boundaries in milliseconds.
    start_ms: int
    end_ms: int
    # The same boundaries converted with ms_to_frame at the requested fps.
    start_frame: int
    end_frame: int
36
+
37
+
38
@dataclass
class StructureResult:
    """Result of structure detection, as returned by detect_structure."""
    # Sections of the track.
    segments: List[Segment]
    # Local energy maxima above the detection threshold.
    energy_peaks: List[EnergyPeak]
    # Low-energy stretches that last long enough to count as silence.
    silence_regions: List[SilenceRegion]
44
+
45
+
46
def detect_structure(
    y: np.ndarray,
    sr: int,
    fps: int = 30,
    on_progress: Optional[Callable[[float], None]] = None
) -> StructureResult:
    """Analyse the energy envelope of a track and derive its structure.

    Args:
        y: Audio samples, passed to ``librosa.feature.rms``.
        sr: Sample rate of ``y``.
        fps: Frame rate used to convert times to frame numbers.
        on_progress: Optional callback; it is invoked with 10, 30, 50, 70
            and 100 as the successive stages complete.

    Returns:
        A StructureResult holding segments, energy peaks and silence regions.
    """
    def report(percent: float) -> None:
        # Progress reporting is optional; skip silently when no callback is set.
        if on_progress:
            on_progress(percent)

    report(10.0)

    # Energy envelope: first row of the RMS feature, plus each frame's time.
    rms = librosa.feature.rms(y=y)[0]
    frame_times = librosa.frames_to_time(range(len(rms)), sr=sr)
    report(30.0)

    # Energy peaks.
    energy_peaks = _detect_energy_peaks(rms, frame_times, fps)
    report(50.0)

    # Silence regions.
    silence_regions = _detect_silence_regions(rms, frame_times, fps, sr)
    report(70.0)

    # Simple energy-based segmentation.
    segments = _detect_segments(y, sr, rms, frame_times, fps)
    report(100.0)

    return StructureResult(
        segments=segments,
        energy_peaks=energy_peaks,
        silence_regions=silence_regions,
    )
87
+
88
+
89
def _detect_energy_peaks(rms: np.ndarray, times: np.ndarray, fps: int) -> List[EnergyPeak]:
    """Find local maxima of the RMS envelope that stand out from the mean.

    A frame is a peak when its RMS is strictly greater than both neighbours
    and strictly greater than ``mean(rms) + std(rms)``. The first and last
    frames are never peaks because they lack a neighbour on one side.

    Args:
        rms: RMS value per analysis frame.
        times: Time in seconds of each frame, parallel to ``rms``.
        fps: Frame rate used to convert peak times to frame numbers.

    Returns:
        Peaks in chronological order. ``intensity`` is the peak's RMS
        divided by the maximum RMS (0.0 if that maximum is not positive).
    """
    rms = np.asarray(rms)

    # A local maximum needs a neighbour on each side.
    if len(rms) < 3:
        return []

    threshold = np.mean(rms) + np.std(rms)
    # Loop-invariant, so computed once rather than per detected peak.
    max_rms = np.max(rms)

    interior = rms[1:-1]
    is_peak = (
        (interior > threshold)
        & (interior > rms[:-2])
        & (interior > rms[2:])
    )

    peaks = []
    # +1 maps positions in `interior` back to indices in `rms`.
    for i in np.flatnonzero(is_peak) + 1:
        time_ms = int(times[i] * 1000)
        peaks.append(EnergyPeak(
            time_ms=time_ms,
            frame=ms_to_frame(time_ms, fps),
            intensity=float(rms[i] / max_rms) if max_rms > 0 else 0.0,
        ))

    return peaks
106
+
107
+
108
def _detect_silence_regions(rms: np.ndarray, times: np.ndarray, fps: int, sr: int) -> List[SilenceRegion]:
    """Find stretches where the RMS envelope stays below a silence threshold.

    A frame is silent when its RMS is below 10% of the mean RMS. A run of
    silent frames is reported when it spans at least 0.5 seconds. A run that
    is still open when the envelope ends is reported too, closed at the time
    of the last frame; without this, trailing silence would be dropped.

    Args:
        rms: RMS value per analysis frame.
        times: Time in seconds of each frame, parallel to ``rms``.
        fps: Frame rate used to convert boundaries to frame numbers.
        sr: Sample rate; not used by this function, kept for the callers.

    Returns:
        Silence regions in chronological order.
    """
    regions = []
    threshold = np.mean(rms) * 0.1  # silence threshold
    min_silence_duration = 0.5  # minimum silence length, in seconds

    def close_run(start_idx: int, end_idx: int) -> None:
        # Record the run [start_idx, end_idx] if it is long enough.
        if times[end_idx] - times[start_idx] < min_silence_duration:
            return
        start_ms = int(times[start_idx] * 1000)
        end_ms = int(times[end_idx] * 1000)
        regions.append(SilenceRegion(
            start_ms=start_ms,
            end_ms=end_ms,
            start_frame=ms_to_frame(start_ms, fps),
            end_frame=ms_to_frame(end_ms, fps),
        ))

    silence_start = None  # index of the first frame of the open run, if any
    for i, energy in enumerate(rms):
        if energy < threshold:
            if silence_start is None:
                silence_start = i
        elif silence_start is not None:
            # First non-silent frame after a run: the run ends here.
            close_run(silence_start, i)
            silence_start = None

    # The audio ended while still silent: close the run at the last frame.
    if silence_start is not None:
        close_run(silence_start, len(rms) - 1)

    return regions
137
+
138
+
139
def _detect_segments(y: np.ndarray, sr: int, rms: np.ndarray, times: np.ndarray, fps: int) -> List[Segment]:
    """Split the track into three equal-length sections.

    This is a deliberately simple strategy: the duration is cut into thirds
    labelled 'intro', 'verse' and 'outro'. Each section's confidence is its
    mean RMS divided by the largest section mean.

    Args:
        y: Audio samples; only their count is used, to get the duration.
        sr: Sample rate of ``y``.
        rms: RMS value per analysis frame.
        times: Not used by this function, kept for the callers.
        fps: Frame rate used to convert boundaries to frame numbers.

    Returns:
        Exactly three Segment objects in chronological order.
    """
    section_types = ['intro', 'verse', 'outro']
    num_sections = len(section_types)

    duration_ms = int(len(y) / sr * 1000)
    section_duration = duration_ms // num_sections

    # Mean energy per third of the envelope. With fewer frames than sections
    # some chunks are empty; they count as zero energy rather than NaN.
    avg_energies = [
        np.mean(chunk) if len(chunk) else 0.0
        for chunk in np.array_split(rms, num_sections)
    ]

    # Loop-invariant normaliser; fall back to 1 to avoid dividing by zero.
    peak_energy = max(avg_energies)
    max_energy = peak_energy if peak_energy > 0 else 1

    segments = []
    for i, section_type in enumerate(section_types):
        start_ms = i * section_duration
        # The last section absorbs the remainder of the integer division.
        end_ms = (i + 1) * section_duration if i < num_sections - 1 else duration_ms

        segments.append(Segment(
            type=section_type,
            start_ms=start_ms,
            end_ms=end_ms,
            start_frame=ms_to_frame(start_ms, fps),
            end_frame=ms_to_frame(end_ms, fps),
            confidence=float(avg_energies[i] / max_energy),
        ))

    return segments
@@ -0,0 +1,73 @@
1
+ """工具函数"""
2
+ from typing import Tuple
3
+ import os
4
+ import logging
5
+ import numpy as np
6
+
7
+ from .errors import FileNotFoundError, UnsupportedFormatError, FFmpegRequiredError
8
+
9
# Formats supported natively (no extra dependency needed)
NATIVE_FORMATS = {'.wav', '.flac', '.ogg'}

# Formats that require ffmpeg
FFMPEG_FORMATS = {'.mp3', '.m4a', '.aac', '.wma'}

# Every supported format (union of the two sets above)
SUPPORTED_FORMATS = NATIVE_FORMATS | FFMPEG_FORMATS
17
+
18
+
19
def ms_to_frame(ms: int, fps: int) -> int:
    """Convert a time in milliseconds to a frame number at ``fps``.

    The fractional part is dropped by ``int()`` (truncation toward zero).
    """
    # Multiply before dividing, matching the original floating-point result.
    scaled = ms * fps
    return int(scaled / 1000)
22
+
23
+
24
def frame_to_ms(frame: int, fps: int) -> int:
    """Convert a frame number at ``fps`` to a time in milliseconds.

    The fractional part is dropped by ``int()`` (truncation toward zero).
    A zero ``fps`` raises ZeroDivisionError.
    """
    # Multiply before dividing, matching the original floating-point result.
    scaled = frame * 1000
    return int(scaled / fps)
27
+
28
+
29
def check_ffmpeg_available() -> bool:
    """Return True when an ``ffmpeg`` executable can be found on PATH."""
    from shutil import which

    ffmpeg_path = which('ffmpeg')
    return ffmpeg_path is not None
33
+
34
+
35
def load_audio(file_path: str) -> Tuple[np.ndarray, int]:
    """Validate an audio path and load it with librosa.

    Native formats: .wav, .flac, .ogg (no extra dependency).
    Formats needing ffmpeg: .mp3, .m4a, .aac, .wma.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The ``(samples, sample_rate)`` pair from ``librosa.load`` called
        with ``sr=None``.

    Raises:
        FileNotFoundError: The path does not exist.
        UnsupportedFormatError: The lower-cased extension is not supported.
        FFmpegRequiredError: The format needs ffmpeg and none was found.
    """
    import librosa

    # The path must exist.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    # The extension (compared case-insensitively) must be supported.
    _, raw_ext = os.path.splitext(file_path)
    ext = raw_ext.lower()
    if ext not in SUPPORTED_FORMATS:
        raise UnsupportedFormatError(ext)

    # Some formats can only be decoded when ffmpeg is installed.
    needs_ffmpeg = ext in FFMPEG_FORMATS
    if needs_ffmpeg and not check_ffmpeg_available():
        raise FFmpegRequiredError(ext)

    # Load the audio.
    samples, sample_rate = librosa.load(file_path, sr=None)
    return samples, sample_rate
59
+
60
+
61
def setup_logging(level: str = "info") -> None:
    """Configure root logging for the given level name.

    Args:
        level: One of "debug", "info", "warning" or "error"; any other
            value falls back to INFO.

    ``force=True`` is passed to ``logging.basicConfig`` so the new settings
    replace any existing root configuration.
    """
    known_levels = {
        name: getattr(logging, name.upper())
        for name in ("debug", "info", "warning", "error")
    }
    resolved_level = known_levels.get(level, logging.INFO)

    logging.basicConfig(
        level=resolved_level,
        format="[%(levelname)s] %(message)s",
        force=True,
    )