media-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cli/clip.py +79 -0
  2. cli/faces.py +91 -0
  3. cli/metadata.py +68 -0
  4. cli/motion.py +77 -0
  5. cli/objects.py +94 -0
  6. cli/ocr.py +93 -0
  7. cli/scenes.py +57 -0
  8. cli/telemetry.py +65 -0
  9. cli/transcript.py +76 -0
  10. media_engine/__init__.py +7 -0
  11. media_engine/_version.py +34 -0
  12. media_engine/app.py +80 -0
  13. media_engine/batch/__init__.py +56 -0
  14. media_engine/batch/models.py +99 -0
  15. media_engine/batch/processor.py +1131 -0
  16. media_engine/batch/queue.py +232 -0
  17. media_engine/batch/state.py +30 -0
  18. media_engine/batch/timing.py +321 -0
  19. media_engine/cli.py +17 -0
  20. media_engine/config.py +674 -0
  21. media_engine/extractors/__init__.py +75 -0
  22. media_engine/extractors/clip.py +401 -0
  23. media_engine/extractors/faces.py +459 -0
  24. media_engine/extractors/frame_buffer.py +351 -0
  25. media_engine/extractors/frames.py +402 -0
  26. media_engine/extractors/metadata/__init__.py +127 -0
  27. media_engine/extractors/metadata/apple.py +169 -0
  28. media_engine/extractors/metadata/arri.py +118 -0
  29. media_engine/extractors/metadata/avchd.py +208 -0
  30. media_engine/extractors/metadata/avchd_gps.py +270 -0
  31. media_engine/extractors/metadata/base.py +688 -0
  32. media_engine/extractors/metadata/blackmagic.py +139 -0
  33. media_engine/extractors/metadata/camera_360.py +276 -0
  34. media_engine/extractors/metadata/canon.py +290 -0
  35. media_engine/extractors/metadata/dji.py +371 -0
  36. media_engine/extractors/metadata/dv.py +121 -0
  37. media_engine/extractors/metadata/ffmpeg.py +76 -0
  38. media_engine/extractors/metadata/generic.py +119 -0
  39. media_engine/extractors/metadata/gopro.py +256 -0
  40. media_engine/extractors/metadata/red.py +305 -0
  41. media_engine/extractors/metadata/registry.py +114 -0
  42. media_engine/extractors/metadata/sony.py +442 -0
  43. media_engine/extractors/metadata/tesla.py +157 -0
  44. media_engine/extractors/motion.py +765 -0
  45. media_engine/extractors/objects.py +245 -0
  46. media_engine/extractors/objects_qwen.py +754 -0
  47. media_engine/extractors/ocr.py +268 -0
  48. media_engine/extractors/scenes.py +82 -0
  49. media_engine/extractors/shot_type.py +217 -0
  50. media_engine/extractors/telemetry.py +262 -0
  51. media_engine/extractors/transcribe.py +579 -0
  52. media_engine/extractors/translate.py +121 -0
  53. media_engine/extractors/vad.py +263 -0
  54. media_engine/main.py +68 -0
  55. media_engine/py.typed +0 -0
  56. media_engine/routers/__init__.py +15 -0
  57. media_engine/routers/batch.py +78 -0
  58. media_engine/routers/health.py +93 -0
  59. media_engine/routers/models.py +211 -0
  60. media_engine/routers/settings.py +87 -0
  61. media_engine/routers/utils.py +135 -0
  62. media_engine/schemas.py +581 -0
  63. media_engine/utils/__init__.py +5 -0
  64. media_engine/utils/logging.py +54 -0
  65. media_engine/utils/memory.py +49 -0
  66. media_engine-0.1.0.dist-info/METADATA +276 -0
  67. media_engine-0.1.0.dist-info/RECORD +70 -0
  68. media_engine-0.1.0.dist-info/WHEEL +4 -0
  69. media_engine-0.1.0.dist-info/entry_points.txt +11 -0
  70. media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,139 @@
1
+ """Blackmagic Design metadata extraction.
2
+
3
+ Handles Blackmagic cameras:
4
+ - Pocket Cinema Camera (4K, 6K, 6K Pro, 6K G2)
5
+ - URSA Mini Pro (4.6K, 12K)
6
+ - Micro Cinema Camera
7
+ - Production Camera 4K
8
+
9
+ Detection methods:
10
+ - .braw extension (Blackmagic RAW)
11
+ - com.apple.proapps.manufacturer: "Blackmagic Design"
12
+ - com.apple.proapps.cameraname: camera model
13
+ - com.apple.proapps.customgamma: LOG profile
14
+
15
+ Note: Full BRAW metadata requires Blackmagic RAW SDK (free download).
16
+ Without it, we still detect the format, but only limited metadata is available from ffprobe.
17
+ """
18
+
19
+ import logging
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ from media_engine.schemas import (
24
+ ColorSpace,
25
+ DetectionMethod,
26
+ DeviceInfo,
27
+ MediaDeviceType,
28
+ Metadata,
29
+ )
30
+
31
+ from .registry import get_tags_lower, register_extractor
32
+
33
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
34
+
35
+
36
+ def _parse_custom_gamma(gamma_string: str) -> str | None:
37
+ """Parse Blackmagic custom gamma string to get LOG profile name.
38
+
39
+ Examples:
40
+ "com.blackmagic-design.productioncamera4k.filmlog" -> "filmlog"
41
+ "com.blackmagic-design.ursa.film" -> "film"
42
+ """
43
+ if not gamma_string:
44
+ return None
45
+
46
+ parts = gamma_string.split(".")
47
+ if parts:
48
+ return parts[-1] # Return the last part as the profile name
49
+
50
+ return None
51
+
52
+
53
class BlackmagicExtractor:
    """Detects Blackmagic Design footage and fills in device and colour metadata."""

    def detect(self, probe_data: dict[str, Any], file_path: str) -> bool:
        """Return True when the file appears to come from a Blackmagic camera.

        Checks run in this order: the ``.braw`` extension, the ProApps
        manufacturer tag, the make/manufacturer tag, and finally the ProApps
        custom gamma tag.
        """
        # Blackmagic RAW is identified by extension alone.
        if Path(file_path).suffix.lower() == ".braw":
            return True

        tag_map = get_tags_lower(probe_data)

        # ProApps manufacturer tag
        proapps_maker = tag_map.get("com.apple.proapps.manufacturer", "")
        if "BLACKMAGIC" in proapps_maker.upper():
            return True

        # Generic make / manufacturer tag
        maker = tag_map.get("make") or tag_map.get("manufacturer")
        if maker and "BLACKMAGIC" in maker.upper():
            return True

        # The custom gamma identifier carries a Blackmagic signature
        gamma_tag = tag_map.get("com.apple.proapps.customgamma", "")
        return "blackmagic" in gamma_tag.lower()

    def extract(self, probe_data: dict[str, Any], file_path: str, base_metadata: Metadata) -> Metadata:
        """Build a new Metadata with Blackmagic device info and LOG transfer.

        All fields other than ``device`` and ``color_space`` are copied from
        ``base_metadata`` unchanged.
        """
        tag_map = get_tags_lower(probe_data)

        if Path(file_path).suffix.lower() == ".braw":
            logger.info("BRAW detected. For full metadata, install Blackmagic RAW SDK.")

        # ProApps tags take precedence over the generic make/model tags.
        camera = DeviceInfo(
            make=(
                tag_map.get("com.apple.proapps.manufacturer")
                or tag_map.get("make")
                or "Blackmagic Design"
            ),
            model=tag_map.get("com.apple.proapps.cameraname") or tag_map.get("model"),
            software=tag_map.get("software"),
            type=MediaDeviceType.CINEMA_CAMERA,
            detection_method=DetectionMethod.METADATA,
            confidence=1.0,
        )

        # Override only the transfer characteristic when a profile name can be
        # read from the custom gamma tag; primaries/matrix come from the base.
        source_cs = base_metadata.color_space
        resolved_cs = source_cs
        gamma_tag = tag_map.get("com.apple.proapps.customgamma", "")
        log_profile = _parse_custom_gamma(gamma_tag) if gamma_tag else None
        if log_profile:
            resolved_cs = ColorSpace(
                transfer=log_profile,
                primaries=None if not source_cs else source_cs.primaries,
                matrix=None if not source_cs else source_cs.matrix,
                detection_method=DetectionMethod.METADATA,
            )

        carried_over = {
            field: getattr(base_metadata, field)
            for field in (
                "duration",
                "resolution",
                "codec",
                "video_codec",
                "audio",
                "fps",
                "bitrate",
                "file_size",
                "timecode",
                "created_at",
                "gps",
                "lens",
            )
        }
        return Metadata(device=camera, color_space=resolved_cs, **carried_over)
136
+
137
+
138
# Register a module-level BlackmagicExtractor instance under the key
# "blackmagic"; this runs as a side effect of importing the module.
register_extractor("blackmagic", BlackmagicExtractor())
@@ -0,0 +1,276 @@
1
+ """Generic 360 camera metadata extractor.
2
+
3
+ Detects 360 cameras from various manufacturers:
4
+ - Insta360 (X3, X4, ONE RS, GO 3, etc.)
5
+ - Kandao QooCam (QooCam 8K, QooCam 3, etc.)
6
+ - GoPro MAX
7
+ - Ricoh Theta
8
+ - Samsung Gear 360
9
+
10
+ Detection methods:
11
+ - File extension (.insv, .insp for Insta360)
12
+ - Filename patterns (Q360_* for QooCam)
13
+ - Dual video streams with square resolution (unstitched fisheye)
14
+ - 2:1 aspect ratio (stitched equirectangular)
15
+ - Spherical metadata tags
16
+ - Handler names and make/model tags
17
+ """
18
+
19
+ import logging
20
+ import re
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ from media_engine.schemas import (
25
+ DetectionMethod,
26
+ DeviceInfo,
27
+ MediaDeviceType,
28
+ Metadata,
29
+ )
30
+
31
+ from .registry import get_tags_lower, register_extractor
32
+
33
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
34
+
35
+
36
# Known 360 camera identifiers.
# Each entry provides:
#   make       - manufacturer name reported in DeviceInfo
#   model      - (optional) default model used when nothing more specific
#                can be derived from the file
#   patterns   - regexes searched case-insensitively in the filename, the
#                make/model tags and the stream handler names
#   extensions - lower-case file suffixes that identify the brand on their own
# NOTE(review): short patterns such as "INS" and "THETA" also match unrelated
# filenames (e.g. "inside.mp4"); confirm that this breadth is intended.
CAMERA_360_BRANDS: dict[str, dict[str, Any]] = {
    "insta360": {
        "make": "Insta360",
        "patterns": [r"INS", r"Insta360"],
        "extensions": [".insv", ".insp"],
    },
    "kandao": {
        "make": "Kandao",
        "patterns": [r"Q360_", r"QooCam", r"Kandao"],
        "extensions": [],
    },
    "gopro_max": {
        "make": "GoPro",
        "model": "MAX",
        "patterns": [r"GoPro MAX", r"GPMAX"],
        "extensions": [".360"],
    },
    "ricoh": {
        "make": "Ricoh",
        "patterns": [r"RICOH THETA", r"THETA"],
        "extensions": [],
    },
    "samsung": {
        "make": "Samsung",
        "patterns": [r"Gear 360", r"SM-R210", r"SM-C200"],
        "extensions": [],
    },
}


class Camera360Extractor:
    """Extract metadata from 360 cameras."""

    @staticmethod
    def _video_streams(probe_data: dict[str, Any]) -> list[dict[str, Any]]:
        """Return the probe streams whose codec_type is "video"."""
        return [s for s in probe_data.get("streams") or [] if s.get("codec_type") == "video"]

    @staticmethod
    def _handler_names(probe_data: dict[str, Any]) -> list[str]:
        """Return the handler_name tag of every stream ("" when absent)."""
        return [
            (stream.get("tags") or {}).get("handler_name") or ""
            for stream in probe_data.get("streams") or []
        ]

    @staticmethod
    def _matches_known_brand(text: str) -> bool:
        """Return True if any brand pattern occurs in *text* (case-insensitive)."""
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for brand_info in CAMERA_360_BRANDS.values()
            for pattern in brand_info.get("patterns", [])
        )

    def detect(self, probe_data: dict[str, Any], file_path: str) -> bool:
        """Detect if this is a 360 camera file.

        A file is accepted when any of the following holds: a brand-specific
        extension, a brand pattern in the filename, in the make/model tags or
        in a stream handler name, spherical metadata, or at least two square
        video streams.
        """
        path = Path(file_path)
        tags = get_tags_lower(probe_data)

        # Check file extensions
        suffix_lower = path.suffix.lower()
        if any(suffix_lower in info.get("extensions", []) for info in CAMERA_360_BRANDS.values()):
            return True

        # Check filename, make/model tags and handler names against the
        # brand patterns (missing tag values are treated as empty strings).
        make = tags.get("make") or tags.get("manufacturer") or ""
        model = tags.get("model") or ""
        candidates = [path.name, f"{make} {model}".strip(), *self._handler_names(probe_data)]
        if any(self._matches_known_brand(text) for text in candidates):
            return True

        # Check for spherical video metadata
        if self._has_spherical_metadata(probe_data, tags):
            return True

        # Check for dual square video streams (unstitched 360)
        return self._has_dual_fisheye_streams(probe_data)

    def _has_spherical_metadata(self, probe_data: dict[str, Any], tags: dict[str, Any]) -> bool:
        """Check for spherical/360 video metadata tags."""
        # Check format tags
        spherical_keys = (
            "spherical",
            "spherical-video",
            "projection_type",
            "stereo_mode",
            "stitching_software",
        )
        if any(key in tags for key in spherical_keys):
            return True

        # Check stream side_data for spherical projection
        for stream in probe_data.get("streams") or []:
            for data in stream.get("side_data_list") or []:
                if isinstance(data, dict) and data.get("side_data_type") == "Spherical Mapping":
                    return True
                if "spherical" in str(data).lower():
                    return True

        return False

    def _has_dual_fisheye_streams(self, probe_data: dict[str, Any]) -> bool:
        """Check for dual video streams with square resolution (unstitched 360)."""
        video_streams = self._video_streams(probe_data)
        if len(video_streams) < 2:
            return False

        square_streams = 0
        for stream in video_streams:
            # A missing or null dimension counts as 0 and never as square.
            width = stream.get("width") or 0
            height = stream.get("height") or 0
            if width > 0 and width == height:
                square_streams += 1

        return square_streams >= 2

    def extract(
        self,
        probe_data: dict[str, Any],
        file_path: str,
        base_metadata: Metadata,
    ) -> Metadata:
        """Extract 360 camera metadata.

        Sets ``base_metadata.device`` in place and returns the same object.
        """
        tags = get_tags_lower(probe_data)

        # Detect brand and model
        make, model = self._detect_brand_model(probe_data, file_path, tags)

        # Detect if it's unstitched (dual fisheye) or stitched (equirectangular)
        is_unstitched = self._has_dual_fisheye_streams(probe_data)

        device = DeviceInfo(
            make=make,
            model=model,
            type=MediaDeviceType.CAMERA_360,
            detection_method=DetectionMethod.METADATA,
            confidence=1.0,
        )

        # Add note about stitching status in software field
        if is_unstitched:
            device.software = "unstitched dual-fisheye"

        base_metadata.device = device
        return base_metadata

    def _detect_brand_model(
        self,
        probe_data: dict[str, Any],
        file_path: str,
        tags: dict[str, Any],
    ) -> tuple[str, str | None]:
        """Detect 360 camera brand and model.

        Sources are consulted in order: file extension, the ``Q360_``
        filename prefix, make/model tags, then stream handler names.

        Returns:
            ``(make, model)``; ``model`` is None when it cannot be determined
            and ``make`` falls back to "Unknown 360 Camera".
        """
        path = Path(file_path)
        filename = path.name
        suffix_lower = path.suffix.lower()

        # Check file extension first
        for brand_key, brand_info in CAMERA_360_BRANDS.items():
            if suffix_lower in brand_info.get("extensions", []):
                # Fall back to the brand's default model (e.g. GoPro "MAX")
                # when the resolution heuristic has nothing to say.
                model = self._detect_model_from_resolution(probe_data, brand_key) or brand_info.get("model")
                return brand_info["make"], model

        # Check filename patterns
        if re.search(r"Q360_", filename):
            return "Kandao", self._detect_qoocam_model(probe_data)

        # Check make/model tags
        make = tags.get("make") or tags.get("manufacturer") or ""
        model = tags.get("model") or ""

        if make:
            # Normalize known brands
            make_upper = make.upper()
            if "INSTA" in make_upper:
                return "Insta360", model or self._detect_model_from_resolution(probe_data, "insta360")
            if "GOPRO" in make_upper:
                return "GoPro", model or "MAX"
            if "RICOH" in make_upper or "THETA" in make_upper:
                return "Ricoh", model
            if "SAMSUNG" in make_upper:
                return "Samsung", model
            if "KANDAO" in make_upper or "QOOCAM" in make_upper:
                return "Kandao", model or self._detect_qoocam_model(probe_data)

            return make, model or None

        # Check handler for INS (Insta360)
        for handler in self._handler_names(probe_data):
            if "INS" in handler.upper():
                return "Insta360", self._detect_model_from_resolution(probe_data, "insta360")

        # Fallback for detected 360 video
        return "Unknown 360 Camera", None

    def _detect_model_from_resolution(self, probe_data: dict[str, Any], brand: str) -> str | None:
        """Detect model based on resolution.

        Only the "insta360" brand has a resolution table; every other brand
        yields None.
        """
        if brand != "insta360":
            return None

        for stream in self._video_streams(probe_data):
            longest = max(stream.get("width") or 0, stream.get("height") or 0)
            if longest >= 3840:
                return "X3/X4"
            if longest >= 2880:
                return "ONE RS"
            if longest >= 1920:
                return "ONE X/X2"

        return None

    def _detect_qoocam_model(self, probe_data: dict[str, Any]) -> str | None:
        """Detect QooCam model based on resolution and codec."""
        for stream in self._video_streams(probe_data):
            width = stream.get("width") or 0
            codec = stream.get("codec_name") or ""

            if width >= 3840:
                if codec == "hevc":
                    return "8K"  # QooCam 8K uses HEVC
                return "8K/3"
            if width >= 2880:
                return "3"

        return None
273
+
274
+
275
# Register a module-level Camera360Extractor instance under the key
# "camera_360"; this runs as a side effect of importing the module.
register_extractor("camera_360", Camera360Extractor())
@@ -0,0 +1,290 @@
1
+ """Canon metadata extraction.
2
+
3
+ Handles Canon cameras:
4
+ - Cinema EOS: C70, C300, C500, etc.
5
+ - EOS R series: R5, R6, R3, etc.
6
+ - DSLRs: 5D, 1DX, etc.
7
+
8
+ Detection methods:
9
+ - make tag: "Canon"
10
+ - XML sidecar files (.XML)
11
+
12
+ Canon XML sidecar files contain:
13
+ - Device info (Manufacturer, ModelName)
14
+ - GPS coordinates (Location element)
15
+ - Creation date (CreationDate element)
16
+
17
+ Canon Cinema EOS MXF filename format:
18
+ - Example: A012C001_230515_BY9X.MXF or A012C001_230515BY9X.MXF
19
+ - The YYMMDD date is embedded after the clip number
20
+ """
21
+
22
+ import logging
23
+ import re
24
+ import xml.etree.ElementTree as ET
25
+ from datetime import datetime, timezone
26
+ from pathlib import Path
27
+ from typing import Any
28
+
29
+ from media_engine.schemas import (
30
+ GPS,
31
+ DetectionMethod,
32
+ DeviceInfo,
33
+ MediaDeviceType,
34
+ Metadata,
35
+ )
36
+
37
+ from .base import SidecarMetadata
38
+ from .registry import get_tags_lower, register_extractor
39
+
40
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
41
+
42
+ # Pattern for Canon Cinema EOS MXF filenames with embedded date
43
+ # Format: A###C###H<YYMMDD><XX>_CANON.MXF
44
+ # Example: A012C001H200529BY_CANON.MXF -> date is 200529 (2020-05-29)
45
+ CANON_DATE_PATTERN = re.compile(r"H(\d{6})", re.IGNORECASE)
46
+
47
+
48
+ def _parse_date_from_filename(file_path: str) -> datetime | None:
49
+ """Extract recording date from Canon MXF filename.
50
+
51
+ Canon Cinema EOS cameras encode the date in the filename:
52
+ - A012C001_230515_BY9X.MXF -> 2023-05-15
53
+ - A012C001_230515BY9X.MXF -> 2023-05-15
54
+ - CLIP_230515.MXF -> 2023-05-15
55
+
56
+ The date format is YYMMDD (2-digit year, month, day).
57
+ """
58
+ filename = Path(file_path).stem
59
+
60
+ match = CANON_DATE_PATTERN.search(filename)
61
+ if not match:
62
+ return None
63
+
64
+ date_str = match.group(1)
65
+ try:
66
+ # Parse YYMMDD format
67
+ year = int(date_str[0:2])
68
+ month = int(date_str[2:4])
69
+ day = int(date_str[4:6])
70
+
71
+ # Convert 2-digit year to 4-digit (assume 20xx for now)
72
+ full_year = 2000 + year if year < 70 else 1900 + year
73
+
74
+ # Validate date components
75
+ if not (1 <= month <= 12 and 1 <= day <= 31):
76
+ return None
77
+
78
+ return datetime(full_year, month, day, tzinfo=timezone.utc)
79
+ except (ValueError, IndexError):
80
+ return None
81
+
82
+
83
def _find_canon_element(
    parent: ET.Element,
    name: str,
    ns: dict[str, str],
    *,
    deep: bool = False,
) -> ET.Element | None:
    """Find child *name*, preferring the Canon namespace over any namespace.

    Uses explicit ``is None`` checks: an Element without children is falsy,
    so chaining two ``find`` calls with ``or`` would discard a valid match.
    """
    prefix = ".//" if deep else ""
    found = parent.find(f"{prefix}canon:{name}", ns)
    if found is None:
        found = parent.find(f"{prefix}{{*}}{name}")
    return found


def _parse_xml_sidecar(video_path: str) -> SidecarMetadata | None:
    """Parse the Canon XML sidecar next to a video file.

    The sidecar shares the video's name with an ``.XML`` (or ``.xml``)
    suffix:
    - Video: A012C001_230515_BY9X.MXF
    - XML: A012C001_230515_BY9X.XML

    Args:
        video_path: Path of the video file whose sidecar should be read.

    Returns:
        SidecarMetadata holding whichever of device, GPS and creation date
        could be read, or None when there is no sidecar, it cannot be parsed,
        or it contains none of those values. Errors are logged, not raised.
    """
    path = Path(video_path)
    xml_path = next(
        (candidate for candidate in (path.with_suffix(".XML"), path.with_suffix(".xml")) if candidate.exists()),
        None,
    )
    if xml_path is None:
        return None

    try:
        root = ET.parse(xml_path).getroot()

        ns = {"canon": "http://www.canon.com/ns/VideoClip"}

        device: DeviceInfo | None = None
        gps: GPS | None = None
        created_at: datetime | None = None

        # Extract device info
        device_elem = _find_canon_element(root, "Device", ns, deep=True)
        if device_elem is not None:
            manufacturer_elem = _find_canon_element(device_elem, "Manufacturer", ns)
            model_elem = _find_canon_element(device_elem, "ModelName", ns)

            manufacturer = manufacturer_elem.text if manufacturer_elem is not None else None
            model_name = model_elem.text if model_elem is not None else None

            if manufacturer or model_name:
                device = DeviceInfo(
                    make=manufacturer,
                    model=model_name,
                    software=None,
                    type=MediaDeviceType.CAMERA,
                    detection_method=DetectionMethod.XML_SIDECAR,
                    confidence=1.0,
                )

        # Extract creation date - try multiple possible element names; all
        # Canon-namespaced names are tried before the namespace wildcards.
        date_elements = [
            ".//canon:CreationDate",
            ".//canon:StartDate",
            ".//canon:Date",
            ".//{*}CreationDate",
            ".//{*}StartDate",
            ".//{*}Date",
        ]
        for date_xpath in date_elements:
            if date_xpath.startswith(".//canon:"):
                date_elem = root.find(date_xpath, ns)
            else:
                date_elem = root.find(date_xpath)

            if date_elem is None or not date_elem.text:
                continue

            date_text = date_elem.text.strip()
            try:
                if "T" in date_text:
                    # ISO timestamp (2023-05-15T10:30:00); "Z" is rewritten
                    # because older fromisoformat() versions reject it.
                    # NOTE(review): a timestamp without an offset stays naive,
                    # unlike the date-only branch below - confirm intent.
                    created_at = datetime.fromisoformat(date_text.replace("Z", "+00:00"))
                else:
                    # Date only (2023-05-15)
                    created_at = datetime.strptime(date_text, "%Y-%m-%d").replace(tzinfo=timezone.utc)
            except ValueError:
                # Unparseable value: try the next candidate element.
                continue
            break

        # Extract GPS from Location element
        location_elem = _find_canon_element(root, "Location", ns, deep=True)
        if location_elem is not None:
            lat_elem = _find_canon_element(location_elem, "Latitude", ns)
            lon_elem = _find_canon_element(location_elem, "Longitude", ns)
            alt_elem = _find_canon_element(location_elem, "Altitude", ns)

            lat = lat_elem.text if lat_elem is not None and lat_elem.text else None
            lon = lon_elem.text if lon_elem is not None and lon_elem.text else None
            alt = alt_elem.text if alt_elem is not None and alt_elem.text else None

            if lat and lon:
                try:
                    gps = GPS(
                        latitude=float(lat),
                        longitude=float(lon),
                        altitude=float(alt) if alt else None,
                    )
                except ValueError as e:
                    # Best effort: keep device/date even if GPS is malformed.
                    logger.debug("Ignoring invalid GPS values in Canon XML sidecar %s: %s", xml_path, e)

        if device or gps or created_at:
            return SidecarMetadata(device=device, gps=gps, created_at=created_at)
        return None

    except ET.ParseError as e:
        logger.warning("Failed to parse Canon XML sidecar %s: %s", xml_path, e)
        return None
    except Exception as e:
        # Sidecar data is optional; never let it break metadata extraction.
        logger.warning("Error reading Canon XML sidecar %s: %s", xml_path, e)
        return None
194
+
195
+
196
class CanonExtractor:
    """Metadata extractor for Canon cameras."""

    def detect(self, probe_data: dict[str, Any], file_path: str) -> bool:
        """Detect if file is from a Canon camera.

        Returns True when a make/manufacturer/company_name tag contains
        "canon" (case-insensitive), or when an XML sidecar next to the file
        has a canon.com root namespace or a Canon device manufacturer.
        """
        tags = get_tags_lower(probe_data)

        # Check make tag (various names used by different formats)
        make = tags.get("make") or tags.get("manufacturer") or tags.get("company_name")
        if make and "CANON" in make.upper():
            return True

        # Check for Canon XML sidecar
        path = Path(file_path)
        for candidate in (path.with_suffix(".XML"), path.with_suffix(".xml")):
            if not candidate.exists():
                continue

            try:
                root = ET.parse(candidate).getroot()
            except Exception as exc:
                # Detection is best effort: an unreadable sidecar just means
                # "not recognised", but leave a trace for debugging.
                logger.debug("Ignoring unreadable XML sidecar %s: %s", candidate, exc)
                continue

            # Check for Canon namespace
            if "canon.com" in str(root.tag).lower():
                return True

            # Check device manufacturer (case-insensitive, like the tag check)
            device = root.find(".//{*}Device")
            if device is not None:
                mfr_elem = device.find(".//{*}Manufacturer")
                if mfr_elem is not None and mfr_elem.text and "CANON" in mfr_elem.text.upper():
                    return True

        return False

    def extract(self, probe_data: dict[str, Any], file_path: str, base_metadata: Metadata) -> Metadata:
        """Extract Canon-specific metadata.

        Builds a new Metadata: device and GPS prefer the XML sidecar; the
        creation date prefers ``base_metadata``, then the sidecar, then the
        date embedded in the filename. All other fields are copied from
        ``base_metadata``.
        """
        tags = get_tags_lower(probe_data)

        # Parse XML sidecar for detailed metadata
        sidecar = _parse_xml_sidecar(file_path)

        # Build device info (prefer sidecar)
        if sidecar and sidecar.device:
            device = sidecar.device
        else:
            device = DeviceInfo(
                make=tags.get("make") or tags.get("manufacturer") or "Canon",
                model=tags.get("model") or tags.get("model_name"),
                software=tags.get("software"),
                type=MediaDeviceType.CAMERA,
                detection_method=DetectionMethod.METADATA,
                confidence=1.0,
            )

        # Merge metadata
        gps = sidecar.gps if sidecar and sidecar.gps else base_metadata.gps

        # Get creation date: prefer base_metadata, then sidecar, then filename
        created_at = base_metadata.created_at
        if created_at is None and sidecar and sidecar.created_at:
            created_at = sidecar.created_at
            logger.debug("Got creation date from XML sidecar: %s", created_at)
        if created_at is None:
            created_at = _parse_date_from_filename(file_path)
            if created_at:
                logger.debug("Parsed creation date from filename: %s", created_at)

        return Metadata(
            duration=base_metadata.duration,
            resolution=base_metadata.resolution,
            codec=base_metadata.codec,
            video_codec=base_metadata.video_codec,
            audio=base_metadata.audio,
            fps=base_metadata.fps,
            bitrate=base_metadata.bitrate,
            file_size=base_metadata.file_size,
            timecode=base_metadata.timecode,
            created_at=created_at,
            device=device,
            gps=gps,
            color_space=base_metadata.color_space,
            lens=base_metadata.lens,
        )
287
+
288
+
289
# Register a module-level CanonExtractor instance under the key "canon";
# this runs as a side effect of importing the module.
register_extractor("canon", CanonExtractor())