media-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cli/clip.py +79 -0
  2. cli/faces.py +91 -0
  3. cli/metadata.py +68 -0
  4. cli/motion.py +77 -0
  5. cli/objects.py +94 -0
  6. cli/ocr.py +93 -0
  7. cli/scenes.py +57 -0
  8. cli/telemetry.py +65 -0
  9. cli/transcript.py +76 -0
  10. media_engine/__init__.py +7 -0
  11. media_engine/_version.py +34 -0
  12. media_engine/app.py +80 -0
  13. media_engine/batch/__init__.py +56 -0
  14. media_engine/batch/models.py +99 -0
  15. media_engine/batch/processor.py +1131 -0
  16. media_engine/batch/queue.py +232 -0
  17. media_engine/batch/state.py +30 -0
  18. media_engine/batch/timing.py +321 -0
  19. media_engine/cli.py +17 -0
  20. media_engine/config.py +674 -0
  21. media_engine/extractors/__init__.py +75 -0
  22. media_engine/extractors/clip.py +401 -0
  23. media_engine/extractors/faces.py +459 -0
  24. media_engine/extractors/frame_buffer.py +351 -0
  25. media_engine/extractors/frames.py +402 -0
  26. media_engine/extractors/metadata/__init__.py +127 -0
  27. media_engine/extractors/metadata/apple.py +169 -0
  28. media_engine/extractors/metadata/arri.py +118 -0
  29. media_engine/extractors/metadata/avchd.py +208 -0
  30. media_engine/extractors/metadata/avchd_gps.py +270 -0
  31. media_engine/extractors/metadata/base.py +688 -0
  32. media_engine/extractors/metadata/blackmagic.py +139 -0
  33. media_engine/extractors/metadata/camera_360.py +276 -0
  34. media_engine/extractors/metadata/canon.py +290 -0
  35. media_engine/extractors/metadata/dji.py +371 -0
  36. media_engine/extractors/metadata/dv.py +121 -0
  37. media_engine/extractors/metadata/ffmpeg.py +76 -0
  38. media_engine/extractors/metadata/generic.py +119 -0
  39. media_engine/extractors/metadata/gopro.py +256 -0
  40. media_engine/extractors/metadata/red.py +305 -0
  41. media_engine/extractors/metadata/registry.py +114 -0
  42. media_engine/extractors/metadata/sony.py +442 -0
  43. media_engine/extractors/metadata/tesla.py +157 -0
  44. media_engine/extractors/motion.py +765 -0
  45. media_engine/extractors/objects.py +245 -0
  46. media_engine/extractors/objects_qwen.py +754 -0
  47. media_engine/extractors/ocr.py +268 -0
  48. media_engine/extractors/scenes.py +82 -0
  49. media_engine/extractors/shot_type.py +217 -0
  50. media_engine/extractors/telemetry.py +262 -0
  51. media_engine/extractors/transcribe.py +579 -0
  52. media_engine/extractors/translate.py +121 -0
  53. media_engine/extractors/vad.py +263 -0
  54. media_engine/main.py +68 -0
  55. media_engine/py.typed +0 -0
  56. media_engine/routers/__init__.py +15 -0
  57. media_engine/routers/batch.py +78 -0
  58. media_engine/routers/health.py +93 -0
  59. media_engine/routers/models.py +211 -0
  60. media_engine/routers/settings.py +87 -0
  61. media_engine/routers/utils.py +135 -0
  62. media_engine/schemas.py +581 -0
  63. media_engine/utils/__init__.py +5 -0
  64. media_engine/utils/logging.py +54 -0
  65. media_engine/utils/memory.py +49 -0
  66. media_engine-0.1.0.dist-info/METADATA +276 -0
  67. media_engine-0.1.0.dist-info/RECORD +70 -0
  68. media_engine-0.1.0.dist-info/WHEEL +4 -0
  69. media_engine-0.1.0.dist-info/entry_points.txt +11 -0
  70. media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,256 @@
1
+ """GoPro metadata extractor.
2
+
3
+ Handles GoPro cameras:
4
+ - HERO5, HERO6, HERO7, HERO8, HERO9, HERO10, HERO11, HERO12, HERO13
5
+ - MAX (360 camera)
6
+ - Session series
7
+
8
+ Detection methods:
9
+ - handler_name containing "GoPro"
10
+ - firmware tag pattern (e.g., "HD7.01.01.90.00" for HERO7)
11
+ - Filename patterns (GH*, GX*, GOPR*)
12
+
13
+ GoPro files contain:
14
+ - gpmd stream: GPS, accelerometer, gyroscope data
15
+ - Timecode
16
+ - Color space (usually BT.709)
17
+ """
18
+
19
+ import logging
20
+ import struct
21
+ import subprocess
22
+ from pathlib import Path
23
+ from typing import Any
24
+
25
+ from media_engine.schemas import (
26
+ GPS,
27
+ DetectionMethod,
28
+ DeviceInfo,
29
+ GPSTrack,
30
+ GPSTrackPoint,
31
+ MediaDeviceType,
32
+ Metadata,
33
+ )
34
+
35
+ from .registry import get_tags_lower, register_extractor
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # GoPro model mapping from firmware prefix
40
+ GOPRO_MODELS = {
41
+ "HD5": "HERO5 Black",
42
+ "HD6": "HERO6 Black",
43
+ "HD7": "HERO7 Black",
44
+ "HD8": "HERO8 Black",
45
+ "HD9": "HERO9 Black",
46
+ "H10": "HERO10 Black",
47
+ "H11": "HERO11 Black",
48
+ "H12": "HERO12 Black",
49
+ "H13": "HERO13 Black",
50
+ "H21": "HERO Session",
51
+ "H22": "HERO5 Session",
52
+ "HX": "MAX",
53
+ "H19": "MAX", # Another MAX identifier
54
+ "FS": "Fusion",
55
+ }
56
+
57
+
58
+ def _parse_firmware_model(firmware: str) -> str | None:
59
+ """Parse GoPro model from firmware string.
60
+
61
+ Examples:
62
+ - "HD7.01.01.90.00" -> HERO7 Black
63
+ - "H10.01.01.40.00" -> HERO10 Black
64
+ """
65
+ if not firmware:
66
+ return None
67
+
68
+ # Try direct prefix match
69
+ for prefix, model in GOPRO_MODELS.items():
70
+ if firmware.upper().startswith(prefix):
71
+ return model
72
+
73
+ return None
74
+
75
+
76
def _parse_gps5_points(data: bytes) -> list[GPSTrackPoint]:
    """Scan a raw GPMD byte stream for GPS5 samples.

    The stream is scanned byte by byte for the FourCC tags "SCAL" and "GPS5".
    Each tag is followed by a 1-byte type, a 1-byte sample size and a
    big-endian 16-bit repeat count, then the payload.

    A SCAL entry of 32-bit signed ints supplies the divisors for the GPS5
    entries that follow it. GPS5 samples are five 32-bit signed ints
    (lat, lon, alt, speed2d, speed3d) and each component has its own divisor,
    so up to three divisors are read (lat, lon, alt). A SCAL entry holding a
    single value is applied to all three components.

    Args:
        data: Raw bytes of the gpmd stream.

    Returns:
        Valid GPS points in stream order, with consecutive duplicates
        (same latitude and longitude) collapsed.
    """
    points: list[GPSTrackPoint] = []
    # Divisors for (lat, lon, alt); raw values are used until a SCAL is seen.
    lat_div = lon_div = alt_div = 1.0
    end = len(data)

    i = 0
    while i < end - 8:
        tag = data[i : i + 4]
        if tag == b"SCAL" or tag == b"GPS5":
            # The loop bound guarantees the 4 header bytes after the tag exist.
            type_byte, sample_size, count = struct.unpack_from(">BBH", data, i + 4)
            payload = i + 8

            if tag == b"SCAL":
                if type_byte == ord("l") and sample_size == 4 and count >= 1:
                    readable = min(count, 3, (end - payload) // 4)
                    if readable >= 1:
                        raw = struct.unpack_from(f">{readable}i", data, payload)
                        # A zero divisor is unusable; keep the previous
                        # divisors instead of failing the whole extraction.
                        if all(raw):
                            divisors = [float(value) for value in raw]
                            divisors += [divisors[0]] * (3 - readable)
                            lat_div, lon_div, alt_div = divisors

            elif type_byte == ord("l") and sample_size == 20:  # 5 x 4-byte ints
                for j in range(count):
                    offset = payload + j * 20
                    if offset + 20 > end:
                        break  # truncated payload; later samples are missing too
                    lat_raw, lon_raw, alt_raw, _speed2d, _speed3d = struct.unpack_from(">5i", data, offset)
                    lat = lat_raw / lat_div
                    lon = lon_raw / lon_div
                    alt = alt_raw / alt_div

                    # Reject out-of-range values and latitude 0 samples.
                    if not (-90 <= lat <= 90 and -180 <= lon <= 180 and lat != 0):
                        continue

                    point = GPSTrackPoint(
                        latitude=round(lat, 6),
                        longitude=round(lon, 6),
                        altitude=round(alt, 1) if alt != 0 else None,
                    )
                    # Dedupe consecutive identical points
                    previous = points[-1] if points else None
                    if previous is None or point.latitude != previous.latitude or point.longitude != previous.longitude:
                        points.append(point)

        i += 1

    return points


def _extract_gpmd_gps(file_path: str) -> tuple[GPS | None, GPSTrack | None]:
    """Extract GPS from GoPro GPMD stream.

    GoPro stores telemetry in a binary stream with FourCC tags.
    GPS data is under DEVC -> STRM -> GPS5 (lat, lon, alt, speed2d, speed3d).
    The stream is copied out of the container with ffmpeg and then scanned by
    _parse_gps5_points().

    This is best-effort: any failure (ffmpeg missing, timeout, non-zero exit,
    parse error) is logged at debug level and reported as "no GPS".

    Args:
        file_path: Path to the video file.

    Returns:
        Tuple of (first GPS point, full GPS track). The track is None when
        fewer than two points were found; both are None when nothing usable
        was extracted.
    """
    try:
        # Extract gpmd stream using ffmpeg.
        # NOTE(review): this assumes the first data stream is the gpmd track.
        # A file that carries another data stream (e.g. a timecode track)
        # ahead of it would yield no GPS here - confirm against real footage.
        cmd = [
            "ffmpeg",
            "-y",
            "-i",
            file_path,
            "-codec",
            "copy",
            "-map",
            "0:d:0",  # First data stream (gpmd)
            "-f",
            "rawvideo",
            "pipe:1",
        ]

        result = subprocess.run(cmd, capture_output=True, timeout=30, check=False)

        if result.returncode != 0 or not result.stdout:
            return None, None

        gps_points = _parse_gps5_points(result.stdout)
        if not gps_points:
            return None, None

        first = gps_points[0]
        first_gps = GPS(
            latitude=first.latitude,
            longitude=first.longitude,
            altitude=first.altitude,
        )
        track = GPSTrack(points=gps_points, source="gpmd") if len(gps_points) > 1 else None
        logger.info(f"Extracted {len(gps_points)} GPS points from GoPro GPMD")
        return first_gps, track

    except Exception as e:
        logger.debug(f"Failed to extract GPS from GPMD: {e}")
        return None, None
177
+
178
+
179
class GoProExtractor:
    """Metadata extractor for GoPro cameras."""

    def detect(self, probe_data: dict[str, Any], file_path: str) -> bool:
        """Decide whether a file comes from a GoPro camera.

        Checks run in this order and the first hit wins:
        1. format-level "firmware" tag starting with a GOPRO_MODELS prefix
        2. any stream whose handler_name tag contains "GoPro"
        3. any stream whose encoder tag contains "GoPro"
        4. file stem starting with GH, GX or GOPR (case-insensitive)

        Args:
            probe_data: Parsed ffprobe JSON output.
            file_path: Path to the video file.

        Returns:
            True if any of the checks matches.
        """
        stem = Path(file_path).stem.upper()
        tags = get_tags_lower(probe_data)

        # Check firmware tag
        firmware = tags.get("firmware", "")
        if firmware and firmware.upper().startswith(tuple(GOPRO_MODELS)):
            return True

        # Check handler_name on every stream first, then encoder on every stream
        for tag_key in ("handler_name", "encoder"):
            for stream in probe_data.get("streams", []):
                if "GoPro" in stream.get("tags", {}).get(tag_key, ""):
                    return True

        # Check filename pattern (GH*, GX*, GOPR*)
        return stem.startswith(("GH", "GX", "GOPR"))

    def extract(
        self,
        probe_data: dict[str, Any],
        file_path: str,
        base_metadata: Metadata,
    ) -> Metadata:
        """Fill in GoPro device info and GPS on the given metadata.

        base_metadata is modified in place and returned. GPS values read from
        the GPMD stream take precedence; when none are found the values
        already present on base_metadata are kept.

        Args:
            probe_data: Parsed ffprobe JSON output.
            file_path: Path to the video file.
            base_metadata: Metadata object to update.

        Returns:
            The same base_metadata object with device, gps and gps_track set.
        """
        firmware = get_tags_lower(probe_data).get("firmware", "")
        model = _parse_firmware_model(firmware)

        # MAX is a 360 camera; everything else is treated as an action camera
        if model and "MAX" in model:
            device_type = MediaDeviceType.CAMERA_360
        else:
            device_type = MediaDeviceType.ACTION_CAMERA

        device = DeviceInfo(
            make="GoPro",
            model=model,
            software=firmware or None,
            type=device_type,
            detection_method=DetectionMethod.METADATA,
            confidence=1.0,
        )

        # Extract GPS from GPMD stream, falling back to what the base already has
        gpmd_gps, gpmd_track = _extract_gpmd_gps(file_path)
        resolved_gps = gpmd_gps if gpmd_gps is not None else base_metadata.gps
        resolved_track = gpmd_track if gpmd_track is not None else base_metadata.gps_track

        base_metadata.device = device
        base_metadata.gps = resolved_gps
        base_metadata.gps_track = resolved_track

        return base_metadata
253
+
254
+
255
# Register the extractor: this statement runs when the module is imported and
# adds a GoProExtractor instance to the registry under the name "gopro".
register_extractor("gopro", GoProExtractor())
@@ -0,0 +1,305 @@
1
+ """RED Digital Cinema metadata extractor.
2
+
3
+ Handles RED cameras:
4
+ - RED ONE (2007-2013)
5
+ - SCARLET (2011-2016)
6
+ - EPIC (2010-2016)
7
+ - DRAGON/Weapon (2014-2018)
8
+ - Helium (2016-present)
9
+ - KOMODO (2020-present)
10
+ - V-Raptor (2021-present)
11
+
12
+ R3D files store metadata in a proprietary header format.
13
+ ffprobe CANNOT read R3D natively, so we parse the header directly.
14
+
15
+ Header structure (reverse-engineered):
16
+ - 0x00-0x03: Size
17
+ - 0x04-0x07: Magic "RED2"
18
+ - 0x08+: TLV-like blocks with type codes
19
+
20
+ Notable fields found in header:
21
+ - Timecode (format HH:MM:SS:FF)
22
+ - Date (format YYYYMMDD)
23
+ - Firmware version
24
+ - Camera model (SCARLET, EPIC, KOMODO, etc.)
25
+ - Serial number
26
+ - Lens info (make, model, focal length, aperture)
27
+ """
28
+
29
+ import logging
30
+ import re
31
+ from pathlib import Path
32
+ from typing import Any
33
+
34
+ from media_engine.schemas import (
35
+ Codec,
36
+ ColorSpace,
37
+ DetectionMethod,
38
+ DeviceInfo,
39
+ LensInfo,
40
+ MediaDeviceType,
41
+ Metadata,
42
+ VideoCodec,
43
+ )
44
+
45
+ from .registry import register_extractor
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
# RED camera models by sensor/body: lowercase keyword -> display name.
# NOTE(review): nothing in the visible part of this module reads this table
# (_parse_r3d_header matches against its own list of model names) - confirm
# whether other modules import it before changing or removing entries.
RED_MODELS = {
    "dragon": "RED Dragon",
    "helium": "RED Helium",
    "gemini": "RED Gemini",
    "monstro": "RED Monstro",
    "komodo": "KOMODO",
    "raptor": "V-RAPTOR",
    "ranger": "RANGER",
    "weapon": "WEAPON",
    "epic": "EPIC",
    "scarlet": "SCARLET",
    "raven": "RAVEN",
    "red one": "RED ONE",
    "dsmc2": "DSMC2",
    "dsmc3": "DSMC3",
}
66
+
67
+
68
+ def _parse_r3d_header(file_path: str) -> dict[str, Any] | None:
69
+ """Parse R3D file header for metadata.
70
+
71
+ Returns dict with extracted metadata or None if not a valid R3D.
72
+ """
73
+ try:
74
+ with open(file_path, "rb") as f:
75
+ # Read header (first 1KB should contain all metadata)
76
+ header = f.read(1024)
77
+
78
+ if len(header) < 8:
79
+ return None
80
+
81
+ # Check magic
82
+ magic = header[4:8]
83
+ if magic != b"RED2":
84
+ return None
85
+
86
+ result: dict[str, Any] = {"make": "RED"}
87
+
88
+ # Find timecode (format like "01:00:26:06")
89
+ tc_match = re.search(rb"\d{2}:\d{2}:\d{2}:\d{2}", header)
90
+ if tc_match:
91
+ tc_str = tc_match.group().decode("ascii")
92
+ result["timecode"] = tc_str
93
+
94
+ # Find date (format YYYYMMDD)
95
+ date_match = re.search(rb"20\d{6}", header)
96
+ if date_match:
97
+ date_str = date_match.group().decode("ascii")
98
+ # Format as YYYY-MM-DD
99
+ result["date"] = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
100
+
101
+ # Find firmware version (X.X.XX pattern)
102
+ fw_match = re.search(rb"\d+\.\d+\.\d+", header)
103
+ if fw_match:
104
+ result["firmware"] = fw_match.group().decode("ascii")
105
+
106
+ # Find camera model - look for known RED models
107
+ models = [
108
+ b"SCARLET",
109
+ b"EPIC",
110
+ b"DRAGON",
111
+ b"WEAPON",
112
+ b"HELIUM",
113
+ b"KOMODO",
114
+ b"RAPTOR",
115
+ b"RED ONE",
116
+ b"GEMINI",
117
+ b"MONSTRO",
118
+ b"RANGER",
119
+ b"RAVEN",
120
+ b"DSMC2",
121
+ b"DSMC3",
122
+ ]
123
+ for model in models:
124
+ if model in header.upper():
125
+ result["model"] = model.decode("ascii")
126
+ break
127
+
128
+ # Find serial number (pattern like 221LS102VTLCZ)
129
+ serial_match = re.search(rb"\d{3}[A-Z]{2}\d{3}[A-Z0-9]+", header)
130
+ if serial_match:
131
+ result["serial"] = serial_match.group().decode("ascii")
132
+
133
+ # Find lens info (look for known lens brands/patterns)
134
+ lens_patterns = [
135
+ rb"Canon [^\x00]+",
136
+ rb"Zeiss [^\x00]+",
137
+ rb"Leica [^\x00]+",
138
+ rb"Sigma [^\x00]+",
139
+ rb"Cooke [^\x00]+",
140
+ rb"Angenieux [^\x00]+",
141
+ rb"Fujinon [^\x00]+",
142
+ rb"RED [^\x00]*PRO [^\x00]*",
143
+ ]
144
+ for pattern in lens_patterns:
145
+ lens_match = re.search(pattern, header)
146
+ if lens_match:
147
+ lens_str = lens_match.group().decode("utf-8", errors="ignore")
148
+ # Clean up null bytes and control chars
149
+ lens_str = re.sub(r"[\x00-\x1f]", "", lens_str).strip()
150
+ if len(lens_str) > 3:
151
+ result["lens_name"] = lens_str
152
+ break
153
+
154
+ # Find lens serial (pattern like 0018-0046-00E6)
155
+ lens_serial_match = re.search(rb"\d{4}-\d{4}-\d{4}", header)
156
+ if lens_serial_match:
157
+ result["lens_serial"] = lens_serial_match.group().decode("ascii")
158
+
159
+ return result if len(result) > 1 else None
160
+
161
+ except Exception as e:
162
+ logger.warning(f"Failed to parse R3D header: {e}")
163
+ return None
164
+
165
+
166
class RedExtractor:
    """Extract metadata from RED cameras."""

    def detect(self, probe_data: dict[str, Any], file_path: str) -> bool:
        """Detect if this is a RED R3D file.

        Only the path is inspected; the file is not opened here. The "RED2"
        magic bytes are checked later, by _parse_r3d_header() during extract().

        Detection methods:
        1. File extension (.R3D, case-insensitive)
        2. RED folder structure: any path component ending in .RDM or .RDC

        Args:
            probe_data: Parsed ffprobe JSON output (not used here).
            file_path: Path to the media file.

        Returns:
            True if the path looks like RED media.
        """
        path = Path(file_path)

        # Check file extension
        if path.suffix.upper() == ".R3D":
            return True

        # Check folder structure (RDM = RED Digital Magazine, RDC = RED Digital Clip)
        return any(part.upper().endswith((".RDM", ".RDC")) for part in path.parts)

    @staticmethod
    def _build_lens(r3d_data: dict[str, Any]) -> LensInfo | None:
        """Build LensInfo from the parsed header, or None if no lens name was found."""
        lens_name = r3d_data.get("lens_name")
        if not lens_name:
            return None

        # Numbers are matched as digits with an optional fractional part, so
        # "14.5mm" is read as 14.5 and stray dots can never reach float().
        number = r"\d+(?:\.\d+)?"

        # Try to parse focal length from lens name
        focal_length: float | None = None
        focal_match = re.search(rf"({number})-({number})mm|({number})mm", lens_name)
        if focal_match:
            if focal_match.group(3):
                focal_length = float(focal_match.group(3))
            elif focal_match.group(1):
                # Zoom lens - use wide end
                focal_length = float(focal_match.group(1))

        # Try to parse aperture ("f/2.8", "f2.8" or "f/.95")
        aperture: float | None = None
        aperture_match = re.search(rf"f/?({number}|\.\d+)", lens_name)
        if aperture_match:
            aperture = float(aperture_match.group(1))

        # Store lens make/model/serial in iris field as LensInfo lacks those fields
        lens_serial = r3d_data.get("lens_serial")
        iris_info = f"{lens_name} (S/N: {lens_serial})" if lens_serial else lens_name

        return LensInfo(
            focal_length=focal_length,
            aperture=aperture,
            iris=iris_info,  # Store full lens info here
            detection_method=DetectionMethod.METADATA,
        )

    def extract(
        self,
        probe_data: dict[str, Any],
        file_path: str,
        base_metadata: Metadata,
    ) -> Metadata:
        """Extract RED-specific metadata from R3D file.

        The R3D header is parsed directly from the file. If it cannot be
        parsed, base_metadata is updated in place with a minimal RED device
        entry (confidence 0.8) and returned. Otherwise a new Metadata object
        is built from the header fields plus the duration, resolution, audio,
        fps, bitrate, created_at and gps values of base_metadata.

        Args:
            probe_data: Parsed ffprobe JSON output (not used here).
            file_path: Path to the R3D file.
            base_metadata: Device-agnostic metadata to build on.

        Returns:
            Metadata describing the RED clip.
        """
        path = Path(file_path)

        # Parse R3D header directly (ffprobe cannot read R3D)
        r3d_data = _parse_r3d_header(file_path)

        if r3d_data is None:
            # Return minimal metadata
            base_metadata.device = DeviceInfo(
                make="RED",
                model=None,
                type=MediaDeviceType.CINEMA_CAMERA,
                detection_method=DetectionMethod.METADATA,
                confidence=0.8,
            )
            return base_metadata

        # Build device info
        # Note: serial_number stored in software field as DeviceInfo doesn't have serial
        serial = r3d_data.get("serial")
        firmware = r3d_data.get("firmware")
        software_str = firmware
        if serial:
            software_str = f"{firmware} (S/N: {serial})" if firmware else f"S/N: {serial}"

        device = DeviceInfo(
            make="RED",
            model=r3d_data.get("model"),
            software=software_str,
            type=MediaDeviceType.CINEMA_CAMERA,
            detection_method=DetectionMethod.METADATA,
            confidence=1.0,
        )

        # Build lens info
        lens = self._build_lens(r3d_data)

        # Get timecode string
        timecode: str | None = r3d_data.get("timecode")

        # Get file size
        file_size = path.stat().st_size if path.exists() else base_metadata.file_size

        # R3D uses REDCODE compression
        video_codec = VideoCodec(
            name="REDCODE",
            profile="RAW",
            bit_depth=16,  # RED shoots 16-bit
        )

        # Color space - RED shoots in REDWideGamutRGB / Log3G10
        color_space = ColorSpace(
            primaries="REDWideGamutRGB",
            transfer="Log3G10",
            matrix=None,
            detection_method=DetectionMethod.METADATA,
        )

        return Metadata(
            duration=base_metadata.duration,
            resolution=base_metadata.resolution,
            codec=Codec(video="REDCODE"),
            video_codec=video_codec,
            audio=base_metadata.audio,
            fps=base_metadata.fps,
            bitrate=base_metadata.bitrate,
            file_size=file_size,
            timecode=timecode,
            created_at=base_metadata.created_at,
            device=device,
            gps=base_metadata.gps,
            color_space=color_space,
            lens=lens,
        )
302
+
303
+
304
# Register the extractor: this statement runs when the module is imported and
# adds a RedExtractor instance to the registry under the name "red".
register_extractor("red", RedExtractor())
@@ -0,0 +1,114 @@
1
+ """Manufacturer detection and extractor registry.
2
+
3
+ This module provides a registry pattern for metadata extractors.
4
+ Each manufacturer module registers an extractor instance, via register_extractor(), that implements detect() and extract().
5
+
6
+ To add a new manufacturer:
7
+ 1. Create a new module (e.g., panasonic.py)
8
+ 2. Implement detect(probe_data, file_path) -> bool
9
+ 3. Implement extract(probe_data, file_path, base_metadata) -> Metadata
10
+ 4. Import the module in __init__.py to trigger registration
11
+ """
12
+
13
+ import logging
14
+ from typing import Any, Protocol
15
+
16
+ from media_engine.schemas import Metadata
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class MetadataExtractor(Protocol):
    """Structural interface for manufacturer-specific metadata extractors.

    Any object that provides these two methods can be passed to
    register_extractor(); no inheritance is required.
    """

    def detect(self, probe_data: dict[str, Any], file_path: str) -> bool:
        """Report whether this extractor is responsible for the given file.

        Args:
            probe_data: Parsed ffprobe JSON output
            file_path: Path to video file

        Returns:
            True if this extractor should handle the file
        """
        ...

    def extract(
        self,
        probe_data: dict[str, Any],
        file_path: str,
        base_metadata: Metadata,
    ) -> Metadata:
        """Produce manufacturer-specific metadata for the given file.

        Args:
            probe_data: Parsed ffprobe JSON output
            file_path: Path to video file
            base_metadata: Base metadata from ffprobe (device-agnostic)

        Returns:
            Enhanced Metadata with device-specific fields
        """
        ...
48
+
49
+
50
# Global registry of extractors as (name, extractor) pairs, kept in
# registration order; get_extractor() consults them in that order.
_extractors: list[tuple[str, MetadataExtractor]] = []


def register_extractor(name: str, extractor: MetadataExtractor) -> None:
    """Add an extractor to the end of the global registry.

    Args:
        name: Extractor name (e.g., "dji", "sony", "apple")
        extractor: Extractor instance implementing detect() and extract()
    """
    entry = (name, extractor)
    _extractors.append(entry)
    logger.debug(f"Registered metadata extractor: {name}")
63
+
64
+
65
def get_extractor(probe_data: dict[str, Any], file_path: str) -> tuple[str, MetadataExtractor] | None:
    """Pick the first registered extractor that claims the file.

    Extractors are consulted in registration order. An extractor whose
    detect() raises is logged as a warning and skipped, so one faulty
    extractor does not prevent the others from being tried.

    Args:
        probe_data: Parsed ffprobe JSON output
        file_path: Path to video file

    Returns:
        Tuple of (name, extractor) or None if no match
    """
    for name, candidate in _extractors:
        try:
            if not candidate.detect(probe_data, file_path):
                continue
            logger.debug(f"Matched extractor: {name}")
            return name, candidate
        except Exception as e:
            logger.warning(f"Extractor {name} detect() failed: {e}")

    return None
88
+
89
+
90
def list_extractors() -> list[str]:
    """Return the names of all registered extractors, in registration order."""
    return [entry[0] for entry in _extractors]
93
+
94
+
95
+ # Helper to get common tag values for detection
96
def get_tags_lower(probe_data: dict[str, Any]) -> dict[str, str]:
    """Return the format-level tags with every key lower-cased.

    A missing "format" or "tags" entry yields an empty dict. If two keys
    differ only by case, the one that comes later in the tags dict wins.
    """
    raw_tags = probe_data.get("format", {}).get("tags", {})
    lowered: dict[str, str] = {}
    for key, value in raw_tags.items():
        lowered[key.lower()] = value
    return lowered
101
+
102
+
103
def get_make_model(probe_data: dict[str, Any]) -> tuple[str | None, str | None]:
    """Look up camera make and model in the format-level tags.

    For each of make and model a fixed list of tag names is tried in order
    and the first non-empty value is used.

    Returns:
        Tuple of (make, model) - either may be None
    """
    tags = get_tags_lower(probe_data)

    def first_truthy(*keys: str) -> str | None:
        # Mirrors an `a or b or c` chain: the first truthy value wins,
        # otherwise the value of the last lookup is returned as-is.
        value = None
        for key in keys:
            value = tags.get(key)
            if value:
                break
        return value

    make = first_truthy("make", "com.apple.quicktime.make", "manufacturer")
    model = first_truthy("model", "com.apple.quicktime.model", "model_name")

    return make, model