hyper-animator-codex 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -0
- package/package.json +1 -1
- package/skills/hyper-animator-codex/SKILL.md +13 -8
- package/skills/hyper-animator-codex/references/beat-sync-workflow.md +51 -0
- package/skills/hyper-animator-codex/scripts/analyze_music_beats.py +78 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/README.md +13 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/__init__.py +33 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/analyzer.py +129 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/beat.py +133 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/cli.py +74 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/errors.py +49 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/structure.py +171 -0
- package/skills/hyper-animator-codex/vendor/music-beat-detector/beat_detector/utils.py +73 -0
package/README.md
CHANGED
|
@@ -43,6 +43,24 @@ npx hyper-animator-codex install --target /path/to/codex/skills
|
|
|
43
43
|
- `skills/hyper-animator-codex/references/`: HyperFrames catalog map, workflow guide, pseudocode, and request examples.
|
|
44
44
|
- `skills/hyper-animator-codex/scripts/validate_hyperframes_html.py`: static pre-render HTML quality gate.
|
|
45
45
|
|
|
46
|
+
## Optional Beat Detection
|
|
47
|
+
|
|
48
|
+
The skill can analyze background music and guide Codex to align HyperFrames/GSAP transitions to beats, bars, energy peaks, and detected music segments.
|
|
49
|
+
|
|
50
|
+
Install optional Python dependencies:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
python3 -m pip install librosa pydub numpy click
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Analyze WAV/FLAC/OGG without ffmpeg. MP3/M4A/AAC/WMA require system `ffmpeg`.
|
|
57
|
+
|
|
58
|
+
Run the bundled analyzer from an installed skill directory or repository checkout:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
python3 skills/hyper-animator-codex/scripts/analyze_music_beats.py path/to/music.wav -o beat-map.json --fps 60 --pretty
|
|
62
|
+
```
|
|
63
|
+
|
|
46
64
|
## Development
|
|
47
65
|
|
|
48
66
|
```bash
|
package/skills/hyper-animator-codex/SKILL.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: hyper-animator-codex
|
|
3
|
-
description: Use when a user asks Codex to create, plan, author, customize, validate, preview, or render a HyperFrames, HTML, or GSAP animation/video from natural-language requirements, including product demos, code demos, data videos, podcast captions, social shorts, catalog
|
|
3
|
+
description: Use when a user asks Codex to create, plan, author, customize, validate, preview, or render a HyperFrames, HTML, or GSAP animation/video from natural-language requirements, including product demos, code demos, data videos, podcast captions, social shorts, catalog assembly, new HyperFrames HTML, sound effects, background music, rhythm, BPM, or beat-synced transitions.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
6
|
# Hyper Animator Codex
|
|
@@ -18,11 +18,13 @@ Turn a natural-language animation or video brief into a validated HyperFrames HT
|
|
|
18
18
|
5. Decide generation mode:
|
|
19
19
|
- `generate_new_hyperframes_html` when the user asks to write HTML, create a new effect, customize style, match a brand, use complex animation, or when component snippets are only paste placeholders.
|
|
20
20
|
- `assemble_existing_catalog_items` only when the user asks to use existing catalog items or quickly compose installed blocks/components.
|
|
21
|
-
6.
|
|
22
|
-
7.
|
|
23
|
-
8.
|
|
24
|
-
9.
|
|
25
|
-
10.
|
|
21
|
+
6. Clarify audio: ask whether to add animation/transition sound effects and whether to add background music.
|
|
22
|
+
7. If background music is used, read `references/beat-sync-workflow.md`, ask for the local audio path, and run `scripts/analyze_music_beats.py` when the file is available.
|
|
23
|
+
8. Ask the second clarification round with candidate context: visual direction, motion rhythm, generation mode, audio choices, and any candidate tradeoffs.
|
|
24
|
+
9. Write or assemble HTML. When beat-sync is enabled, align major reveals, cuts, transitions, camera moves, and visual accents to the beat map instead of arbitrary timestamps.
|
|
25
|
+
10. Run pre-render quality gates.
|
|
26
|
+
11. Show a concise plan summary and preview path or HTML file to the user. Ask for confirmation before video render.
|
|
27
|
+
12. Render only after user confirmation, then report output path and any caveats.
|
|
26
28
|
|
|
27
29
|
## Interactive Questions
|
|
28
30
|
|
|
@@ -30,8 +32,8 @@ Prefer Codex interactive question tools such as `AskUserQuestion` or `request_us
|
|
|
30
32
|
|
|
31
33
|
Use two rounds:
|
|
32
34
|
|
|
33
|
-
- Round 1: purpose, format, duration, platform, required content, brand assets, and whether video render is expected in this turn.
|
|
34
|
-
- Round 2: after catalog scoring, ask about style, motion, candidate selection, and
|
|
35
|
+
- Round 1: purpose, format, duration, platform, required content, brand assets, sound effects, background music, and whether video render is expected in this turn.
|
|
36
|
+
- Round 2: after catalog scoring, ask about style, motion, candidate selection, generation mode, and beat-sync assumptions when background music is present.
|
|
35
37
|
|
|
36
38
|
Do not ask everything upfront when the brief is already specific. Ask only for missing or decision-changing information.
|
|
37
39
|
|
|
@@ -40,6 +42,7 @@ Do not ask everything upfront when the brief is already specific. Ask only for m
|
|
|
40
42
|
- Read `references/hyperframes-intent-workflow.md` for the full AskUserQuestion workflow, generation-mode rules, scoring model, and render confirmation requirements.
|
|
41
43
|
- Read `references/hyperframes-catalog-map.json` whenever selecting catalog candidates or determining visual references.
|
|
42
44
|
- Read `references/hyperframes-agent-pseudocode.ts` when implementing the end-to-end loop or when the correct sequence is ambiguous.
|
|
45
|
+
- Read `references/beat-sync-workflow.md` when sound effects, background music, soundtrack, beat sync, rhythm, BPM, audio-reactive animation, or transition timing to music is mentioned.
|
|
43
46
|
- Use `references/examples/*.json` for sanity checks against common request shapes.
|
|
44
47
|
|
|
45
48
|
## Catalog Rules
|
|
@@ -79,6 +82,8 @@ Before rendering, summarize:
|
|
|
79
82
|
|
|
80
83
|
- generation mode;
|
|
81
84
|
- selected or referenced catalog items;
|
|
85
|
+
- sound effects and background music choices;
|
|
86
|
+
- beat map path, BPM, duration, and timing assumptions when beat-sync is enabled;
|
|
82
87
|
- dimensions and duration;
|
|
83
88
|
- content assumptions;
|
|
84
89
|
- preview location;
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Beat Sync Workflow
|
|
2
|
+
|
|
3
|
+
Use this when the user mentions sound effects, background music, soundtrack, beat sync, rhythm, BPM, transitions to music, or audio-reactive animation.
|
|
4
|
+
|
|
5
|
+
## Clarify Audio
|
|
6
|
+
|
|
7
|
+
Ask:
|
|
8
|
+
|
|
9
|
+
- Should animation or transition sound effects be added?
|
|
10
|
+
- Should background music be added?
|
|
11
|
+
- If background music is used, what is the local audio path?
|
|
12
|
+
- May scene timing, transition timing, and animation accents be adjusted to match the music?
|
|
13
|
+
- Should the music be trimmed, looped, or should video duration follow the selected music region?
|
|
14
|
+
|
|
15
|
+
## Analyze Music
|
|
16
|
+
|
|
17
|
+
When a background music file is provided:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
python3 scripts/analyze_music_beats.py path/to/music.wav -o path/to/beat-map.json --fps 60 --pretty
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
If dependencies are missing, tell the user:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
python3 -m pip install librosa pydub numpy click
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
MP3/M4A/AAC/WMA require system `ffmpeg`; WAV/FLAC/OGG are safer.
|
|
30
|
+
|
|
31
|
+
## Map Beats To Animation
|
|
32
|
+
|
|
33
|
+
- Use `meta.bpm` for global pacing.
|
|
34
|
+
- Use beats with `beat_in_bar: 1` for major scene changes, title reveals, camera moves, and transitions.
|
|
35
|
+
- Use `energy_level: strong` for visual accents, scale pulses, light sweeps, flashes, or cut emphasis.
|
|
36
|
+
- Use weak beats for secondary motion only.
|
|
37
|
+
- Use `structure.segments` to shape intro, main body, and outro.
|
|
38
|
+
- Use `structure.energy_peaks` for high-impact moments.
|
|
39
|
+
- Keep `data-duration` consistent with the selected music region or state trim/loop assumptions.
|
|
40
|
+
|
|
41
|
+
## GSAP Timing
|
|
42
|
+
|
|
43
|
+
Convert beat times from milliseconds to seconds:
|
|
44
|
+
|
|
45
|
+
```js
|
|
46
|
+
const beatSeconds = beatMap.beats.map((beat) => beat.time_ms / 1000);
|
|
47
|
+
tl.addLabel("downbeat_1", beatSeconds[0]);
|
|
48
|
+
tl.to(".hero", { opacity: 1, y: 0, duration: 0.35 }, "downbeat_1");
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Do not use wall-clock audio playback to drive render progress. The render timeline remains a paused GSAP timeline registered in `window.__timelines`.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
sys.dont_write_bytecode = True
|
|
9
|
+
|
|
10
|
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
|
11
|
+
VENDOR_ROOT = SCRIPT_DIR.parent / "vendor" / "music-beat-detector"
|
|
12
|
+
sys.path.insert(0, str(VENDOR_ROOT))
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 (used as an argparse ``type=`` callable).

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is below 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args() -> argparse.Namespace:
    """Parse the analyzer's command-line arguments from ``sys.argv``.

    Returns:
        A namespace with ``audio_file``, ``output`` (``None`` when omitted),
        ``fps`` (positive int, default 60), ``pretty`` (bool) and
        ``log_level`` (one of debug/info/warning/error, default "error").
    """
    parser = argparse.ArgumentParser(
        description="Analyze background music and write a beat map JSON for HyperFrames timing."
    )
    parser.add_argument("audio_file", help="Path to WAV, FLAC, OGG, MP3, M4A, AAC, or WMA audio")
    parser.add_argument("-o", "--output", help="Output JSON path. Prints JSON to stdout when omitted.")
    # A zero or negative frame rate would produce meaningless frame indexes,
    # so it is rejected here instead of being passed to the detector.
    parser.add_argument(
        "--fps",
        type=_positive_int,
        default=60,
        help="Timeline frame rate for frame indexes. Default: 60",
    )
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON")
    parser.add_argument(
        "--log-level",
        default="error",
        choices=["debug", "info", "warning", "error"],
        help="Detector log level",
    )
    return parser.parse_args()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _print_missing_dependency(exc: ModuleNotFoundError) -> None:
    """Explain on stderr how to install the optional beat-detection dependencies."""
    print(
        "Error: missing Python dependency for beat detection. "
        "Install optional dependencies with: python3 -m pip install librosa pydub numpy click",
        file=sys.stderr,
    )
    print(f"Missing module: {exc.name}", file=sys.stderr)


def main() -> int:
    """Analyze the requested audio file and write or print its beat map.

    Returns:
        Process exit status: 0 on success, 1 when the audio file does not
        exist or the output file cannot be written, 3 for detector errors,
        4 when ffmpeg is required but unavailable, 10 when a Python
        dependency is missing.
    """
    args = parse_args()
    audio_path = Path(args.audio_file)
    if not audio_path.exists():
        print(f"Error: audio file does not exist: {audio_path}", file=sys.stderr)
        return 1

    # The vendored detector is imported lazily so that a missing optional
    # dependency produces an actionable message instead of a traceback.
    try:
        from beat_detector import analyze
        from beat_detector.errors import BeatDetectorError, FFmpegRequiredError
    except ModuleNotFoundError as exc:
        _print_missing_dependency(exc)
        return 10

    try:
        result = analyze(str(audio_path), fps=args.fps, log_level=args.log_level)
    except FFmpegRequiredError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        print("Install ffmpeg or use WAV/FLAC/OGG audio for fewer runtime dependencies.", file=sys.stderr)
        return 4
    except BeatDetectorError as exc:
        message = getattr(exc, "message", str(exc))
        print(f"Error: {message}", file=sys.stderr)
        return 3
    except ModuleNotFoundError as exc:
        # Some detector modules import their dependencies only when called.
        _print_missing_dependency(exc)
        return 10

    if not args.output:
        print(result.to_json(pretty=args.pretty))
        return 0

    try:
        result.save(args.output, pretty=args.pretty)
    except OSError as exc:
        # Previously an unwritable output path surfaced as a raw traceback;
        # the exit status (1) is unchanged.
        print(f"Error: cannot write beat map to {args.output}: {exc}", file=sys.stderr)
        return 1
    print(f"Beat map saved to: {args.output}")
    return 0
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
if __name__ == "__main__":
|
|
78
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Vendored music-beat-detector
|
|
2
|
+
|
|
3
|
+
Trimmed source from `git@github.com:realpkuasule/music-beat-detector.git`, commit `29b081fbe3bb38f0fa8cb569fa3150d7cfdb18cb`.
|
|
4
|
+
|
|
5
|
+
Only `beat_detector/` is vendored. Sample media, ffmpeg archives, development contracts, and upstream tests are intentionally omitted from the npm package.
|
|
6
|
+
|
|
7
|
+
Optional runtime dependencies:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
python3 -m pip install librosa pydub numpy click
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
MP3/M4A/AAC/WMA files require system `ffmpeg`; WAV/FLAC/OGG are preferred for fewer dependencies.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Music Beat Detector - 音乐自动打点工具"""
|
|
2
|
+
|
|
3
|
+
from .analyzer import analyze, AnalysisResult, MetaInfo
|
|
4
|
+
from .beat import Beat, BeatResult, EnergyLevel
|
|
5
|
+
from .structure import Segment, EnergyPeak, SilenceRegion, StructureResult
|
|
6
|
+
from .errors import BeatDetectorError, FileNotFoundError, UnsupportedFormatError, AnalysisError, FFmpegRequiredError
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
# 主要 API
|
|
12
|
+
'analyze',
|
|
13
|
+
'AnalysisResult',
|
|
14
|
+
'MetaInfo',
|
|
15
|
+
|
|
16
|
+
# 节拍检测
|
|
17
|
+
'Beat',
|
|
18
|
+
'BeatResult',
|
|
19
|
+
'EnergyLevel',
|
|
20
|
+
|
|
21
|
+
# 结构检测
|
|
22
|
+
'StructureResult',
|
|
23
|
+
'Segment',
|
|
24
|
+
'EnergyPeak',
|
|
25
|
+
'SilenceRegion',
|
|
26
|
+
|
|
27
|
+
# 异常
|
|
28
|
+
'BeatDetectorError',
|
|
29
|
+
'FileNotFoundError',
|
|
30
|
+
'UnsupportedFormatError',
|
|
31
|
+
'FFmpegRequiredError',
|
|
32
|
+
'AnalysisError',
|
|
33
|
+
]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""分析器模块"""
|
|
2
|
+
from dataclasses import dataclass, field, asdict
|
|
3
|
+
from typing import List, Callable, Optional, Any
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .beat import Beat, BeatResult, detect_beats
|
|
9
|
+
from .structure import StructureResult, detect_structure
|
|
10
|
+
from .utils import load_audio, setup_logging
|
|
11
|
+
from .errors import (
|
|
12
|
+
FileNotFoundError as BeatDetectorFileNotFoundError,
|
|
13
|
+
FFmpegRequiredError,
|
|
14
|
+
UnsupportedFormatError
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class MetaInfo:
    """Metadata about an analyzed audio file."""

    # Path of the audio file that was analyzed.
    file: str
    # Audio length in milliseconds.
    duration_ms: int
    # Sample rate in Hz.
    sample_rate: int
    # Detected tempo in beats per minute.
    bpm: float
    # Time signature label; defaults to "4/4".
    time_signature: str = "4/4"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class AnalysisResult:
    """Complete analysis result: metadata, detected beats and music structure."""

    meta: MetaInfo
    beats: List[Beat] = field(default_factory=list)
    # Optional: may be None, in which case the exported structure lists are empty.
    structure: Optional[StructureResult] = None

    def to_dict(self) -> dict:
        """Return the result as a JSON-serializable dictionary.

        The ``structure`` entry always contains the ``segments``,
        ``energy_peaks`` and ``silence_regions`` keys; they are empty lists
        when no structure result is attached.
        """
        if self.structure is None:
            # The field defaults to None; dereferencing it unconditionally
            # used to raise AttributeError.
            segments, energy_peaks, silence_regions = [], [], []
        else:
            segments = [asdict(seg) for seg in self.structure.segments]
            energy_peaks = [asdict(peak) for peak in self.structure.energy_peaks]
            silence_regions = [asdict(region) for region in self.structure.silence_regions]

        return {
            'meta': asdict(self.meta),
            'beats': [
                {
                    'time_ms': beat.time_ms,
                    'frame': beat.frame,
                    'beat_in_bar': beat.beat_in_bar,
                    # Enum members are exported by their string value.
                    'energy_level': beat.energy_level.value,
                }
                for beat in self.beats
            ],
            'structure': {
                'segments': segments,
                'energy_peaks': energy_peaks,
                'silence_regions': silence_regions,
            },
        }

    def to_json(self, pretty: bool = False) -> str:
        """Serialize the result to JSON, indented by two spaces when *pretty*."""
        indent = 2 if pretty else None
        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)

    def save(self, path: str, pretty: bool = False) -> None:
        """Write the JSON representation to *path* as UTF-8."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(self.to_json(pretty=pretty))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def analyze(
    file_path: str,
    fps: int = 30,
    log_level: str = "info",
    on_progress: Optional[Callable[[float], None]] = None
) -> AnalysisResult:
    """Analyze an audio file and return its beats and structure.

    Args:
        file_path: Path of the audio file to load.
        fps: Frame rate used to convert millisecond times to frame indexes.
        log_level: Logging level name passed to ``setup_logging``.
        on_progress: Optional callback receiving overall progress (0-100).

    Returns:
        The assembled ``AnalysisResult``.

    Raises:
        FileNotFoundError (package-local), UnsupportedFormatError,
        FFmpegRequiredError: re-raised unchanged from ``load_audio``.
        AnalysisError: when loading fails for any other reason.
    """
    # Local import: AnalysisError is not in this module's top-level import list.
    from .errors import AnalysisError

    setup_logging(log_level)
    logger = logging.getLogger(__name__)

    def report_progress(stage_progress: float) -> None:
        """Forward progress to the caller's callback, if one was given."""
        if on_progress:
            on_progress(stage_progress)

    logger.info(f"Loading audio file: {file_path}")
    try:
        y, sr = load_audio(file_path)
    except (BeatDetectorFileNotFoundError, FFmpegRequiredError, UnsupportedFormatError):
        raise
    except Exception as e:
        # Previously every other failure (e.g. a corrupt file) was reported
        # as "File not found" and the original cause was discarded.
        raise AnalysisError(f"Failed to load audio file {file_path}: {e}", cause=e) from e

    report_progress(10.0)

    duration_ms = int(len(y) / sr * 1000)
    logger.info(f"Audio duration: {duration_ms/1000:.1f}s, sample rate: {sr}Hz")

    # Beat detection covers the 10-55 % band of overall progress.
    logger.info("Detecting beats...")
    beat_result = detect_beats(y, sr, fps=fps, on_progress=lambda p: report_progress(10 + p * 0.45))
    logger.info(f"Detected BPM: {beat_result.bpm:.1f}")

    report_progress(55.0)

    # Structure detection covers the 55-100 % band.
    logger.info("Analyzing structure...")
    structure_result = detect_structure(y, sr, fps=fps, on_progress=lambda p: report_progress(55 + p * 0.45))
    logger.info(f"Found {len(structure_result.segments)} segments")

    report_progress(100.0)

    meta = MetaInfo(
        file=file_path,
        duration_ms=duration_ms,
        sample_rate=sr,
        bpm=beat_result.bpm,
        time_signature=beat_result.time_signature
    )

    result = AnalysisResult(
        meta=meta,
        beats=beat_result.beats,
        structure=structure_result
    )

    logger.info("Analysis complete")
    return result
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""节拍检测模块"""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import List, Callable, Optional
|
|
4
|
+
from enum import Enum
|
|
5
|
+
import numpy as np
|
|
6
|
+
import librosa
|
|
7
|
+
|
|
8
|
+
from .utils import ms_to_frame
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EnergyLevel(Enum):
    """Energy class of a beat; each member's value is its lowercase name."""

    WEAK = "weak"
    MEDIUM = "medium"
    STRONG = "strong"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Beat:
    """A single detected beat."""

    # Beat time in milliseconds.
    time_ms: int
    # Frame index corresponding to time_ms.
    frame: int
    # 1-based position of the beat inside its bar.
    beat_in_bar: int
    # Energy class of the beat; MEDIUM unless stated otherwise.
    energy_level: EnergyLevel = EnergyLevel.MEDIUM
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class BeatResult:
    """Outcome of beat detection."""

    # Estimated tempo in beats per minute.
    bpm: float
    # Detected beats.
    beats: List[Beat]
    # Time signature label; defaults to "4/4".
    time_signature: str = "4/4"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def detect_beats(
    y: np.ndarray,
    sr: int,
    fps: int = 30,
    on_progress: Optional[Callable[[float], None]] = None
) -> BeatResult:
    """Detect beats in an audio signal with librosa.

    Args:
        y: Audio samples.
        sr: Sample rate of *y*.
        fps: Frame rate used for each beat's ``frame`` index.
        on_progress: Optional callback receiving this stage's progress (0-100).

    Returns:
        A ``BeatResult`` with the estimated BPM and one ``Beat`` per detected
        beat. Bar positions cycle 1..4 starting at the first detected beat
        (a 4/4 bar is assumed).
    """

    def notify(percent: float) -> None:
        if on_progress:
            on_progress(percent)

    notify(10.0)

    # librosa returns the tempo estimate and the beat positions (as frames).
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # Tempo may come back as a scalar or an array; take its first element.
    bpm = float(np.asarray(tempo).reshape(-1)[0])

    notify(40.0)

    # Beat positions converted to whole milliseconds.
    beat_times_ms = [
        int(seconds * 1000)
        for seconds in librosa.frames_to_time(beat_frames, sr=sr)
    ]

    energies = _compute_beat_energies(y, sr, beat_frames)

    notify(70.0)

    levels = _classify_energy_levels(energies)

    notify(80.0)

    beats_per_bar = 4  # assumes 4/4 time
    beats = [
        Beat(
            time_ms=time_ms,
            frame=ms_to_frame(time_ms, fps),
            beat_in_bar=(index % beats_per_bar) + 1,
            energy_level=levels[index]
        )
        for index, time_ms in enumerate(beat_times_ms)
    ]

    notify(100.0)

    return BeatResult(bpm=bpm, beats=beats)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _compute_beat_energies(
    y: np.ndarray,
    sr: int,
    beat_frames: np.ndarray,
    window_ms: int = 50
) -> np.ndarray:
    """Compute the local RMS energy around every beat.

    For each entry of *beat_frames* a window of about *window_ms*
    milliseconds, centred on the beat and clipped to the signal bounds, is
    measured. An empty window contributes 0.0.

    Returns:
        An array with one energy value per beat.
    """
    half_window = int(sr * window_ms / 1000) // 2
    total_samples = len(y)

    def local_rms(frame):
        # Convert the librosa frame index into a sample index.
        center = librosa.frames_to_samples(frame)
        lo = max(0, center - half_window)
        hi = min(total_samples, center + half_window)
        if hi <= lo:
            return 0.0
        window = y[lo:hi]
        # RMS serves as the energy measure.
        return np.sqrt(np.mean(window ** 2))

    return np.array([local_rms(frame) for frame in beat_frames])
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _classify_energy_levels(energies: np.ndarray) -> List[EnergyLevel]:
    """Bucket beat energies into weak / medium / strong.

    The thresholds are the 33rd and 66th percentiles of *energies*: values
    at or below the lower threshold are WEAK, values at or above the upper
    one are STRONG, everything else is MEDIUM. The WEAK test is applied
    first. Empty input gives an empty list.
    """
    if len(energies) == 0:
        return []

    weak_threshold = np.percentile(energies, 33)
    strong_threshold = np.percentile(energies, 66)

    def classify(energy) -> EnergyLevel:
        if energy <= weak_threshold:
            return EnergyLevel.WEAK
        if energy >= strong_threshold:
            return EnergyLevel.STRONG
        return EnergyLevel.MEDIUM

    return [classify(energy) for energy in energies]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""命令行接口"""
|
|
2
|
+
import json
|
|
3
|
+
import sys
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from .analyzer import analyze
|
|
7
|
+
from .errors import (
|
|
8
|
+
BeatDetectorError,
|
|
9
|
+
FileNotFoundError,
|
|
10
|
+
UnsupportedFormatError,
|
|
11
|
+
AnalysisError,
|
|
12
|
+
FFmpegRequiredError
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@click.command()
@click.argument('input_file', type=click.Path(exists=False))
@click.option('-o', '--output', type=click.Path(), help='Output file path')
@click.option('--fps', default=30, help='Frame rate (default: 30)')
@click.option('--log-level', default='info',
              type=click.Choice(['debug', 'info', 'warning', 'error']),
              help='Log level (default: info)')
@click.option('--pretty', is_flag=True, help='Pretty print JSON')
def main(input_file: str, output: str, fps: int, log_level: str, pretty: bool) -> None:
    """
    Music beat detector CLI

    Analyze audio file and output beat detection results in JSON format.

    Supported formats (no extra dependencies):
    .wav, .flac, .ogg

    Formats requiring ffmpeg:
    .mp3, .m4a, .aac, .wma

    Usage:
    beat-detector input.mp3 -o output.json
    beat-detector input.wav --pretty
    """
    # NOTE: the docstring above is the --help text shown by click, so
    # maintainer notes live in comments instead.
    #
    # Exit status: 0 on success; for detector errors the exception's own
    # ``code`` (1 file not found, 2 unsupported format, 3 analysis failure,
    # 4 ffmpeg required); 3 for any other unexpected exception.
    try:
        result = analyze(input_file, fps=fps, log_level=log_level)

        if output:
            # save() serializes the result itself, so it is not serialized
            # a second time up front.
            result.save(output, pretty=pretty)
            click.echo(f"Output saved to: {output}")
        else:
            click.echo(result.to_json(pretty=pretty))

    except BeatDetectorError as e:
        # Every detector exception carries its exit status in ``code``.
        click.echo(f"Error: {e.message}", err=True)
        sys.exit(e.code)

    except Exception as e:
        click.echo(f"Unexpected error: {str(e)}", err=True)
        sys.exit(3)

    sys.exit(0)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
if __name__ == '__main__':
|
|
74
|
+
main()
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""错误类型"""
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BeatDetectorError(Exception):
    """Base class of the beat detector's exceptions.

    Attributes are set in ``__init__``: ``message`` (human-readable text,
    also used as the exception argument) and ``code`` (integer, default 1).
    """

    def __init__(self, message: str, code: int = 1):
        super().__init__(message)
        self.message = message
        self.code = code
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FileNotFoundError(BeatDetectorError):
    """Raised when the audio file does not exist (code 1).

    NOTE: this class shadows the builtin of the same name in any module
    that imports it.
    """

    def __init__(self, path: str):
        message = f"File not found: {path}"
        super().__init__(message, code=1)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class UnsupportedFormatError(BeatDetectorError):
    """Raised for an audio format the detector does not support (code 2)."""

    def __init__(self, format: str):
        message = f"Unsupported audio format: {format}"
        super().__init__(message, code=2)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FFmpegRequiredError(BeatDetectorError):
    """Raised when a format needs ffmpeg (code 4).

    The message includes per-platform install commands and points to the
    native formats as an alternative.
    """

    def __init__(self, format: str):
        # Only the first fragment interpolates a value; the rest are plain
        # literals joined by implicit concatenation.
        message = (
            f"Format '{format}' requires ffmpeg. "
            "Please install ffmpeg:\n"
            " Ubuntu/Debian: sudo apt install ffmpeg\n"
            " macOS: brew install ffmpeg\n"
            " Windows: choco install ffmpeg\n"
            "\n"
            "Or use native formats: .wav, .flac, .ogg"
        )
        super().__init__(message, code=4)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class AnalysisError(BeatDetectorError):
    """Raised when the analysis itself fails (code 3).

    The optional underlying exception is kept on the ``cause`` attribute.
    """

    def __init__(self, message: str, cause: Optional[Exception] = None):
        super().__init__(message, code=3)
        self.cause = cause
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""结构检测模块"""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import List, Callable, Optional
|
|
4
|
+
import numpy as np
|
|
5
|
+
import librosa
|
|
6
|
+
|
|
7
|
+
from .utils import ms_to_frame
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Segment:
    """A section of the music."""

    # Section label, e.g. 'intro', 'verse' or 'outro'.
    type: str
    # Section start and end in milliseconds.
    start_ms: int
    end_ms: int
    # Section start and end as frame indexes.
    start_frame: int
    end_frame: int
    # Confidence score for the section.
    confidence: float
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class EnergyPeak:
    """A moment of peak energy."""

    # Peak time in milliseconds.
    time_ms: int
    # Frame index corresponding to time_ms.
    frame: int
    # Peak strength.
    intensity: float
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SilenceRegion:
    """A stretch of silence."""

    # Region bounds in milliseconds.
    start_ms: int
    end_ms: int
    # Region bounds as frame indexes.
    start_frame: int
    end_frame: int
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class StructureResult:
    """Outcome of structure detection."""

    # Detected sections of the music.
    segments: List[Segment]
    # Detected energy peaks.
    energy_peaks: List[EnergyPeak]
    # Detected silent stretches.
    silence_regions: List[SilenceRegion]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def detect_structure(
    y: np.ndarray,
    sr: int,
    fps: int = 30,
    on_progress: Optional[Callable[[float], None]] = None
) -> StructureResult:
    """Detect energy peaks, silent regions and coarse segments of a signal.

    Args:
        y: Audio samples.
        sr: Sample rate of *y*.
        fps: Frame rate used for all frame indexes in the result.
        on_progress: Optional callback receiving this stage's progress (0-100).

    Returns:
        A ``StructureResult`` holding segments, energy peaks and silence regions.
    """

    def notify(percent: float) -> None:
        if on_progress:
            on_progress(percent)

    notify(10.0)

    # Frame-wise RMS energy and the time (seconds) of every RMS frame.
    rms = librosa.feature.rms(y=y)[0]
    rms_frames = librosa.frames_to_time(range(len(rms)), sr=sr)

    notify(30.0)

    energy_peaks = _detect_energy_peaks(rms, rms_frames, fps)

    notify(50.0)

    silence_regions = _detect_silence_regions(rms, rms_frames, fps, sr)

    notify(70.0)

    # Simple, energy-based segmentation.
    segments = _detect_segments(y, sr, rms, rms_frames, fps)

    notify(100.0)

    return StructureResult(
        segments=segments,
        energy_peaks=energy_peaks,
        silence_regions=silence_regions
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _detect_energy_peaks(rms: np.ndarray, times: np.ndarray, fps: int) -> List[EnergyPeak]:
    """Find local maxima of the RMS curve that stand out from the average.

    A frame counts as a peak when its RMS exceeds ``mean + std`` of the
    whole curve and is strictly greater than both neighbours. The first and
    last frames are never peaks.

    Args:
        rms: Frame-wise RMS energy.
        times: Time in seconds of each RMS frame (same length as *rms*).
        fps: Frame rate used for each peak's ``frame`` index.

    Returns:
        Peaks in chronological order; ``intensity`` is the frame's RMS
        divided by the curve's maximum (0.0 when that maximum is not positive).
    """
    # Fewer than three frames leave no interior frame to test; returning
    # early also avoids mean/std/max on an empty array.
    if len(rms) < 3:
        return []

    threshold = np.mean(rms) + np.std(rms)
    # Hoisted out of the loop: the maximum is loop-invariant and was
    # previously recomputed for every detected peak.
    max_rms = np.max(rms)

    peaks = []
    for i in range(1, len(rms) - 1):
        value = rms[i]
        if value > threshold and value > rms[i - 1] and value > rms[i + 1]:
            time_ms = int(times[i] * 1000)
            peaks.append(EnergyPeak(
                time_ms=time_ms,
                frame=ms_to_frame(time_ms, fps),
                intensity=float(value / max_rms) if max_rms > 0 else 0.0
            ))

    return peaks
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _detect_silence_regions(rms: np.ndarray, times: np.ndarray, fps: int, sr: int) -> List[SilenceRegion]:
    """Find stretches where the RMS energy stays below 10 % of its mean.

    Only stretches of at least 0.5 seconds are reported. A stretch that
    lasts until the final frame is closed at that frame's time.

    Args:
        rms: Frame-wise RMS energy.
        times: Time in seconds of each RMS frame (same length as *rms*).
        fps: Frame rate used for the regions' frame indexes.
        sr: Sample rate; unused here, kept for the existing call signature.

    Returns:
        Silence regions in chronological order.
    """
    if len(rms) == 0:
        return []

    regions: List[SilenceRegion] = []
    threshold = np.mean(rms) * 0.1  # silence threshold
    min_silence_duration = 0.5  # minimum silence length in seconds

    def close_region(start_index: int, end_index: int) -> None:
        """Record the stretch between the two frame indexes if it is long enough."""
        if times[end_index] - times[start_index] < min_silence_duration:
            return
        start_ms = int(times[start_index] * 1000)
        end_ms = int(times[end_index] * 1000)
        regions.append(SilenceRegion(
            start_ms=start_ms,
            end_ms=end_ms,
            start_frame=ms_to_frame(start_ms, fps),
            end_frame=ms_to_frame(end_ms, fps)
        ))

    silence_start = None
    for i, energy in enumerate(rms):
        if energy < threshold:
            if silence_start is None:
                silence_start = i
        elif silence_start is not None:
            close_region(silence_start, i)
            silence_start = None

    # Previously a silent run reaching the end of the audio was dropped
    # because no louder frame followed to close it.
    if silence_start is not None:
        close_region(silence_start, len(rms) - 1)

    return regions
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _detect_segments(y: np.ndarray, sr: int, rms: np.ndarray, times: np.ndarray, fps: int) -> List[Segment]:
    """Split the audio into three equal-length sections.

    The sections are labelled 'intro', 'verse' and 'outro'; the last one
    extends to the exact end of the audio. Each section's confidence is its
    mean RMS divided by the largest section mean (divisor 1 when that
    largest mean is not positive). *times* is not used.
    """
    duration_ms = int(len(y) / sr * 1000)

    # Simple strategy: a fixed three-way split.
    section_types = ['intro', 'verse', 'outro']
    num_sections = 3
    section_duration = duration_ms // num_sections

    # Mean energy of each third of the RMS curve.
    avg_energies = [np.mean(chunk) for chunk in np.array_split(rms, num_sections)]
    loudest = max(avg_energies)
    max_energy = loudest if loudest > 0 else 1

    segments = []
    for index, (label, avg_energy) in enumerate(zip(section_types, avg_energies)):
        start_ms = index * section_duration
        if index < num_sections - 1:
            end_ms = (index + 1) * section_duration
        else:
            end_ms = duration_ms

        segments.append(Segment(
            type=label,
            start_ms=start_ms,
            end_ms=end_ms,
            start_frame=ms_to_frame(start_ms, fps),
            end_frame=ms_to_frame(end_ms, fps),
            confidence=float(avg_energy / max_energy)
        ))

    return segments
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""工具函数"""
|
|
2
|
+
from typing import Tuple
|
|
3
|
+
import os
|
|
4
|
+
import logging
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .errors import FileNotFoundError, UnsupportedFormatError, FFmpegRequiredError
|
|
8
|
+
|
|
9
|
+
# 原生支持的格式(无需额外依赖)
|
|
10
|
+
NATIVE_FORMATS = {'.wav', '.flac', '.ogg'}
|
|
11
|
+
|
|
12
|
+
# 需要 ffmpeg 的格式
|
|
13
|
+
FFMPEG_FORMATS = {'.mp3', '.m4a', '.aac', '.wma'}
|
|
14
|
+
|
|
15
|
+
# 所有支持的格式
|
|
16
|
+
SUPPORTED_FORMATS = NATIVE_FORMATS | FFMPEG_FORMATS
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def ms_to_frame(ms: int, fps: int) -> int:
    """Convert milliseconds to a frame index at *fps*, truncating the fraction."""
    frames = ms * fps / 1000
    return int(frames)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def frame_to_ms(frame: int, fps: int) -> int:
    """Convert a frame index at *fps* to milliseconds, truncating the fraction."""
    milliseconds = frame * 1000 / fps
    return int(milliseconds)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def check_ffmpeg_available() -> bool:
    """Return True when an ``ffmpeg`` executable is found on the search path."""
    import shutil

    ffmpeg_path = shutil.which('ffmpeg')
    return ffmpeg_path is not None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def load_audio(file_path: str) -> Tuple[np.ndarray, int]:
    """Load an audio file with librosa at its original sample rate.

    Native formats (.wav, .flac, .ogg) need no extra dependency; .mp3,
    .m4a, .aac and .wma require ffmpeg.

    Returns:
        The ``(samples, sample_rate)`` pair returned by ``librosa.load``.

    Raises:
        FileNotFoundError (package-local): the path does not exist.
        UnsupportedFormatError: the extension is not a supported format.
        FFmpegRequiredError: the format needs ffmpeg and none is available.
    """
    import librosa

    # The file must exist.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    # The extension (compared case-insensitively) must be supported.
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()
    if ext not in SUPPORTED_FORMATS:
        raise UnsupportedFormatError(ext)

    # ffmpeg-backed formats need the executable.
    needs_ffmpeg = ext in FFMPEG_FORMATS
    if needs_ffmpeg and not check_ffmpeg_available():
        raise FFmpegRequiredError(ext)

    # sr=None keeps the file's own sample rate.
    y, sr = librosa.load(file_path, sr=None)
    return y, sr
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def setup_logging(level: str = "info") -> None:
    """Configure root logging with a ``[LEVEL] message`` format.

    *level* is one of "debug", "info", "warning" or "error"; any other value
    falls back to INFO. ``force=True`` replaces any existing root handlers.
    """
    level_map = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
    }
    resolved_level = level_map.get(level, logging.INFO)
    logging.basicConfig(
        level=resolved_level,
        format="[%(levelname)s] %(message)s",
        force=True
    )
|