cloudflare-images-migrator 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloudflare_images_migrator-1.0.0.dist-info/METADATA +474 -0
- cloudflare_images_migrator-1.0.0.dist-info/RECORD +17 -0
- cloudflare_images_migrator-1.0.0.dist-info/WHEEL +5 -0
- cloudflare_images_migrator-1.0.0.dist-info/entry_points.txt +3 -0
- cloudflare_images_migrator-1.0.0.dist-info/licenses/LICENSE +21 -0
- cloudflare_images_migrator-1.0.0.dist-info/top_level.txt +1 -0
- src/__init__.py +1 -0
- src/audit.py +620 -0
- src/cloudflare_client.py +746 -0
- src/config.py +161 -0
- src/image_tracker.py +405 -0
- src/logger.py +160 -0
- src/migrator.py +491 -0
- src/parsers.py +609 -0
- src/quality.py +558 -0
- src/security.py +528 -0
- src/utils.py +355 -0
src/parsers.py
ADDED
@@ -0,0 +1,609 @@
|
|
1
|
+
"""
|
2
|
+
Image reference parsers for different file formats
|
3
|
+
"""
|
4
|
+
|
5
|
+
import re
|
6
|
+
import json
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import List, Dict, Set, Tuple, Optional
|
9
|
+
from bs4 import BeautifulSoup
|
10
|
+
from urllib.parse import urljoin, urlparse
|
11
|
+
|
12
|
+
from .utils import is_url, normalize_path, safe_read_file
|
13
|
+
|
14
|
+
|
15
|
+
class ImageReference:
    """A single image reference discovered while scanning a source file."""

    def __init__(self, path: str, line_number: int, column: int,
                 context: str, ref_type: str, original_text: str):
        # Where the image lives: filesystem path, URL, or data URI.
        self.path = path
        # 1-based line of the reference inside the scanned file.
        self.line_number = line_number
        self.column = column
        # Surrounding text, kept so the reference can be replaced in place.
        self.context = context
        # One of 'local', 'url' or 'data'.
        self.ref_type = ref_type
        # The exact matched text as it appeared in the file.
        self.original_text = original_text
        # Convenience flag derived from the path.
        self.is_url = is_url(path)

    def __str__(self):
        return f"ImageRef({self.path}, line {self.line_number}, {self.ref_type})"

    def __repr__(self):
        return str(self)
|
33
|
+
|
34
|
+
|
35
|
+
class BaseParser:
    """Common interface shared by every image-reference parser."""

    def __init__(self, supported_formats: List[str]):
        # Normalise extensions to lower case once so can_parse() is a
        # cheap membership test.
        self.supported_formats = [ext.lower() for ext in supported_formats]

    def can_parse(self, file_path: Path) -> bool:
        """Return True when this parser handles files like *file_path*."""
        return file_path.suffix.lower() in self.supported_formats

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Extract image references from a file; subclasses must override."""
        raise NotImplementedError
|
48
|
+
|
49
|
+
|
50
|
+
class HTMLParser(BaseParser):
    """Parser for HTML and similar markup files.

    Extracts image references from ``<img>`` tags, from ``url(...)``
    tokens in inline ``style=""`` attributes and from ``<style>`` blocks.
    When BeautifulSoup cannot process the markup, a regex-based fallback
    scan is used instead.
    """

    def __init__(self):
        super().__init__(['.html', '.htm', '.xhtml', '.xml'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return all image references found in *file_path*.

        Args:
            file_path: File being scanned (read only when *content* is
                not supplied).
            content: Optional pre-read file contents.

        Returns:
            A list of ImageReference objects; empty when the file cannot
            be read.
        """
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []

        try:
            soup = BeautifulSoup(content, 'html.parser')

            # <img src="..."> tags.
            for img in soup.find_all('img'):
                src = img.get('src')
                if src:
                    line_num = self._find_line_number(content, str(img))
                    references.append(ImageReference(
                        path=src,
                        line_number=line_num,
                        column=0,  # column is not recoverable from the parsed tree
                        context=str(img),
                        ref_type='url' if is_url(src) else 'local',
                        original_text=str(img)
                    ))

            # url(...) inside inline style="" attributes.
            for element in soup.find_all(attrs={'style': True}):
                style = element.get('style', '')
                css_refs = self._parse_css_urls(style)
                if css_refs:
                    # One lookup per element: every url() in the same
                    # attribute shares the element's line number.
                    line_num = self._find_line_number(content, str(element))
                    for ref in css_refs:
                        references.append(ImageReference(
                            path=ref,
                            line_number=line_num,
                            column=0,
                            context=str(element),
                            ref_type='url' if is_url(ref) else 'local',
                            original_text=style
                        ))

            # url(...) inside <style> blocks.
            for style_tag in soup.find_all('style'):
                if style_tag.string:
                    css_refs = self._parse_css_urls(style_tag.string)
                    if css_refs:
                        # Same line number for every url() in this block.
                        line_num = self._find_line_number(content, style_tag.string)
                        for ref in css_refs:
                            references.append(ImageReference(
                                path=ref,
                                line_number=line_num,
                                column=0,
                                context=style_tag.string,
                                ref_type='url' if is_url(ref) else 'local',
                                original_text=style_tag.string
                            ))

        except Exception:
            # BeautifulSoup failed on this markup; discard any partial
            # results and fall back to a plain regex scan.
            references = self._regex_parse(content)

        return references

    def _parse_css_urls(self, css_content: str) -> List[str]:
        """Extract image URLs from CSS ``url(...)`` tokens."""
        url_pattern = r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'
        matches = re.findall(url_pattern, css_content, re.IGNORECASE)
        return [match for match in matches if self._is_image_url(match)]

    def _is_image_url(self, url: str) -> bool:
        """Heuristically decide whether *url* refers to an image.

        Deliberately permissive: matches classic extensions, data URIs,
        well-known image hosts, badge services, and image-looking path
        segments or query parameters.  False positives are expected and
        filtered downstream.
        """
        if not url or len(url.strip()) == 0:
            return False

        url_lower = url.lower().strip()

        # Traditional image extensions
        image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico', '.avif', '.heic']
        if any(url_lower.endswith(ext) for ext in image_extensions):
            return True

        # Data URLs
        if url_lower.startswith('data:image'):
            return True

        # Common image hosting domains and services
        image_domains = [
            'img.shields.io',        # Badge service
            'github.com',            # GitHub assets
            'githubusercontent.com', # GitHub raw content
            'imagedelivery.net',     # Cloudflare Images
            'imgur.com',             # Imgur
            'gravatar.com',          # Gravatar
            'unsplash.com',          # Unsplash
            'pexels.com',            # Pexels
            'pixabay.com',           # Pixabay
            'jsdelivr.net',          # CDN images
            'cloudfront.net',        # AWS CloudFront
            'fastly.com',            # Fastly CDN
            'shopify.com',           # Shopify assets
            'squarespace.com',       # Squarespace assets
            'wix.com',               # Wix assets
            'wordpress.com',         # WordPress assets
            'medium.com',            # Medium images
            'assets',                # Generic assets path
        ]

        # Check if it's from known image hosting domains
        parsed_url = urlparse(url_lower)
        if parsed_url.netloc:
            for domain in image_domains:
                if domain in parsed_url.netloc:
                    return True

        # GitHub asset patterns (specific patterns for GitHub)
        if 'github.com' in url_lower and '/assets/' in url_lower:
            return True
        if 'githubusercontent.com' in url_lower:
            return True

        # Badge services (img.shields.io, badgen.net, etc.)
        badge_indicators = ['badge', 'shield', 'logo=', 'style=', 'color=']
        if any(indicator in url_lower for indicator in badge_indicators):
            return True

        # SVG files in URLs (often don't have .svg extension)
        if 'svg' in url_lower:
            return True

        # Image-like paths (contain image keywords)
        image_keywords = ['/images/', '/img/', '/pics/', '/photos/', '/assets/', '/media/',
                          '/uploads/', '/content/', '/static/', '/public/', 'icon', 'logo',
                          'banner', 'avatar', 'thumbnail', 'cover', 'screenshot']
        if any(keyword in url_lower for keyword in image_keywords):
            return True

        # URLs with image-like query parameters
        if '?' in url and any(param in url_lower for param in ['width=', 'height=', 'size=', 'format=', 'quality=']):
            return True

        return False

    def _find_line_number(self, content: str, search_text: str) -> int:
        """Return the 1-based line of the first occurrence of *search_text*.

        Falls back to line 1 when the text cannot be located (e.g. the
        serialized tag differs from the raw markup).
        """
        try:
            index = content.find(search_text)
            if index != -1:
                return content[:index].count('\n') + 1
        except Exception:
            pass
        return 1

    def _regex_parse(self, content: str) -> List[ImageReference]:
        """Fallback regex scan used when BeautifulSoup parsing fails."""
        references = []

        # HTML img src pattern
        img_pattern = r'<img[^>]+src\s*=\s*["\']([^"\']+)["\'][^>]*>'
        for match in re.finditer(img_pattern, content, re.IGNORECASE):
            src = match.group(1)
            if self._is_image_url(src):
                line_num = content[:match.start()].count('\n') + 1
                references.append(ImageReference(
                    path=src,
                    line_number=line_num,
                    column=match.start(),
                    context=match.group(0),
                    ref_type='url' if is_url(src) else 'local',
                    original_text=match.group(0)
                ))

        return references
|
226
|
+
|
227
|
+
|
228
|
+
class CSSParser(BaseParser):
    """Parser for CSS files."""

    def __init__(self):
        super().__init__(['.css', '.scss', '.sass', '.less'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Collect image references from ``url(...)`` tokens in a stylesheet."""
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        found = []
        css_url_re = r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)'

        for hit in re.finditer(css_url_re, content, re.IGNORECASE):
            target = hit.group(1)
            if not self._is_image_url(target):
                continue
            start = hit.start()
            found.append(ImageReference(
                path=target,
                line_number=content.count('\n', 0, start) + 1,
                # 1-based column within the line (rfind yields -1 on line 1,
                # which still produces the correct offset).
                column=start - content.rfind('\n', 0, start),
                context=hit.group(0),
                ref_type='url' if is_url(target) else 'local',
                original_text=hit.group(0),
            ))

        return found

    def _is_image_url(self, url: str) -> bool:
        """Heuristically decide whether *url* refers to an image resource.

        Deliberately permissive: anything that looks image-like by
        extension, host, path segment or query string is accepted.
        """
        if not url or not url.strip():
            return False

        candidate = url.lower().strip()

        # Direct hits: classic file extensions and data URIs.
        extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
                      '.bmp', '.ico', '.avif', '.heic')
        if candidate.endswith(extensions):
            return True
        if candidate.startswith('data:image'):
            return True

        # Hosts that overwhelmingly serve images.
        known_hosts = (
            'img.shields.io',        # badge service
            'github.com',            # GitHub assets
            'githubusercontent.com', # GitHub raw content
            'imagedelivery.net',     # Cloudflare Images
            'imgur.com',
            'gravatar.com',
            'unsplash.com',
            'pexels.com',
            'pixabay.com',
            'jsdelivr.net',          # CDN images
            'cloudfront.net',        # AWS CloudFront
            'fastly.com',            # Fastly CDN
            'shopify.com',
            'squarespace.com',
            'wix.com',
            'wordpress.com',
            'medium.com',
            'assets',                # generic assets host fragment
        )
        netloc = urlparse(candidate).netloc
        if netloc and any(host in netloc for host in known_hosts):
            return True

        # GitHub-hosted assets.
        if 'github.com' in candidate and '/assets/' in candidate:
            return True
        if 'githubusercontent.com' in candidate:
            return True

        # Badge/shield style URLs.
        if any(tag in candidate for tag in ('badge', 'shield', 'logo=', 'style=', 'color=')):
            return True

        # SVGs frequently lack an extension.
        if 'svg' in candidate:
            return True

        # Paths that look like image storage locations.
        hints = ('/images/', '/img/', '/pics/', '/photos/', '/assets/',
                 '/media/', '/uploads/', '/content/', '/static/',
                 '/public/', 'icon', 'logo', 'banner', 'avatar',
                 'thumbnail', 'cover', 'screenshot')
        if any(hint in candidate for hint in hints):
            return True

        # Image-sizing query parameters.
        if '?' in url and any(q in candidate for q in
                              ('width=', 'height=', 'size=', 'format=', 'quality=')):
            return True

        return False
|
332
|
+
|
333
|
+
|
334
|
+
class JavaScriptParser(BaseParser):
    """Parser for JavaScript/TypeScript files.

    Scans for image paths in import/require statements, plain string
    literals, and JSX ``<img>``/``<Image>`` elements.  Because the
    patterns overlap (an import statement also contains a bare string
    literal), hits are de-duplicated on (path, line) so each reference
    is reported exactly once.
    """

    def __init__(self):
        super().__init__(['.js', '.jsx', '.ts', '.tsx', '.mjs'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return image references found in *file_path* (or *content*)."""
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []
        seen = set()  # (path, line) pairs already reported

        # String literals that might contain image paths
        patterns = [
            # Import statements
            r'import\s+.*?from\s+["\']([^"\']+\.(png|jpg|jpeg|gif|webp|svg))["\']',
            # Require statements
            r'require\s*\(\s*["\']([^"\']+\.(png|jpg|jpeg|gif|webp|svg))["\']',
            # String literals with image extensions
            r'["\']([^"\']*\.(png|jpg|jpeg|gif|webp|svg))["\']',
            # JSX img src
            r'<img[^>]+src\s*=\s*[{"\'][^}"\']*["\']([^"\']+)["\']',
            # React Image component
            r'<Image[^>]+src\s*=\s*[{"\'][^}"\']*["\']([^"\']+)["\']',
        ]

        for pattern in patterns:
            for match in re.finditer(pattern, content, re.IGNORECASE):
                # The image path is the first capture group.
                img_path = match.group(1) if match.groups() else match.group(0)

                if not self._is_image_path(img_path):
                    continue

                line_num = content[:match.start()].count('\n') + 1
                key = (img_path, line_num)
                if key in seen:
                    # Already captured by an earlier, more specific pattern
                    # (e.g. the string-literal pattern re-matching an import).
                    continue
                seen.add(key)

                references.append(ImageReference(
                    path=img_path,
                    line_number=line_num,
                    column=match.start() - content.rfind('\n', 0, match.start()),
                    context=match.group(0),
                    ref_type='url' if is_url(img_path) else 'local',
                    original_text=match.group(0)
                ))

        return references

    def _is_image_path(self, path: str) -> bool:
        """Return True when *path* ends with a known image extension."""
        image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico')
        return path.lower().endswith(image_extensions)
|
385
|
+
|
386
|
+
|
387
|
+
class MarkdownParser(BaseParser):
    """Parser for Markdown files.

    Handles inline images ``![alt](url)``, reference-style images
    ``![alt][label]`` (resolved against ``[label]: url`` definitions),
    and HTML ``<img>``/bare ``src=`` snippets embedded in the Markdown.
    """

    # Matches reference definitions of the form "[label]: url".
    _REF_DEF_RE = re.compile(r'^[ \t]*\[([^\]]+)\]:[ \t]*(\S+)', re.MULTILINE)

    def __init__(self):
        super().__init__(['.md', '.markdown', '.mdown', '.mkd'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return image references found in *file_path* (or *content*)."""
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []

        # Resolve reference-style definitions up front; Markdown labels
        # are matched case-insensitively.
        ref_defs = {label.lower(): url
                    for label, url in self._REF_DEF_RE.findall(content)}

        #  or  -- NOTE: group(2) may still carry
        # a trailing "title"; preserved as-is for compatibility.
        inline = r'!\[([^\]]*)\]\(([^)]+)\)'
        # ![alt][label] with a matching [label]: image.jpg definition
        reference = r'!\[([^\]]*)\]\[([^\]]+)\]'
        # HTML img tags embedded in markdown
        html_img = r'<img[^>]+src\s*=\s*["\']([^"\']+)["\'][^>]*>'
        # Bare src= attribute (common in HTML snippets)
        bare_src = r'src\s*=\s*["\']([^"\']+)["\']'

        seen = set()  # (path, line): bare_src re-matches every <img> hit

        for pattern, kind in ((inline, 'inline'), (reference, 'reference'),
                              (html_img, 'html'), (bare_src, 'src')):
            for match in re.finditer(pattern, content, re.IGNORECASE | re.MULTILINE):
                if kind == 'inline':
                    img_path = match.group(2)
                elif kind == 'reference':
                    # BUG FIX: group(2) is the reference *label*, not a
                    # path -- resolve it; skip labels never defined.
                    img_path = ref_defs.get(match.group(2).lower())
                    if img_path is None:
                        continue
                else:
                    img_path = match.group(1)

                if not self._is_image_path(img_path):
                    continue

                line_num = content[:match.start()].count('\n') + 1
                key = (img_path, line_num)
                if key in seen:
                    continue
                seen.add(key)

                references.append(ImageReference(
                    path=img_path,
                    line_number=line_num,
                    column=match.start() - content.rfind('\n', 0, match.start()),
                    context=match.group(0),
                    ref_type='url' if is_url(img_path) else 'local',
                    original_text=match.group(0)
                ))

        return references

    def _is_image_path(self, path: str) -> bool:
        """Heuristically decide whether *path* refers to an image.

        Deliberately permissive (extensions, data URIs, known hosts,
        badge services, image-like path segments and query parameters);
        false positives are expected and filtered downstream.
        """
        if not path or len(path.strip()) == 0:
            return False

        path_lower = path.lower().strip()

        # Traditional image extensions
        image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico', '.avif', '.heic']
        if any(path_lower.endswith(ext) for ext in image_extensions):
            return True

        # Data URLs
        if path_lower.startswith('data:image'):
            return True

        # Common image hosting domains and services
        image_domains = [
            'img.shields.io',        # Badge service
            'github.com',            # GitHub assets
            'githubusercontent.com', # GitHub raw content
            'imagedelivery.net',     # Cloudflare Images
            'imgur.com',             # Imgur
            'gravatar.com',          # Gravatar
            'unsplash.com',          # Unsplash
            'pexels.com',            # Pexels
            'pixabay.com',           # Pixabay
            'jsdelivr.net',          # CDN images
            'cloudfront.net',        # AWS CloudFront
            'fastly.com',            # Fastly CDN
            'shopify.com',           # Shopify assets
            'squarespace.com',       # Squarespace assets
            'wix.com',               # Wix assets
            'wordpress.com',         # WordPress assets
            'medium.com',            # Medium images
            'assets',                # Generic assets path
        ]

        # Check if it's from known image hosting domains
        parsed_url = urlparse(path_lower)
        if parsed_url.netloc:
            for domain in image_domains:
                if domain in parsed_url.netloc:
                    return True

        # GitHub asset patterns (specific patterns for GitHub)
        if 'github.com' in path_lower and '/assets/' in path_lower:
            return True
        if 'githubusercontent.com' in path_lower:
            return True

        # Badge services (img.shields.io, badgen.net, etc.)
        badge_indicators = ['badge', 'shield', 'logo=', 'style=', 'color=']
        if any(indicator in path_lower for indicator in badge_indicators):
            return True

        # SVG files in URLs (often don't have .svg extension)
        if 'svg' in path_lower:
            return True

        # Image-like paths (contain image keywords)
        image_keywords = ['/images/', '/img/', '/pics/', '/photos/', '/assets/', '/media/',
                          '/uploads/', '/content/', '/static/', '/public/', 'icon', 'logo',
                          'banner', 'avatar', 'thumbnail', 'cover', 'screenshot']
        if any(keyword in path_lower for keyword in image_keywords):
            return True

        # URLs with image-like query parameters
        if '?' in path and any(param in path_lower for param in ['width=', 'height=', 'size=', 'format=', 'quality=']):
            return True

        return False
|
510
|
+
|
511
|
+
|
512
|
+
class JSONParser(BaseParser):
    """Parser for JSON configuration files."""

    def __init__(self):
        super().__init__(['.json'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Walk the JSON document, or regex-scan it when it is malformed."""
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        refs = []

        try:
            document = json.loads(content)
        except json.JSONDecodeError:
            # Not valid JSON -- fall back to scanning for quoted image paths.
            literal_re = r'["\']([^"\']*\.(png|jpg|jpeg|gif|webp|svg))["\']'
            for hit in re.finditer(literal_re, content, re.IGNORECASE):
                target = hit.group(1)
                start = hit.start()
                refs.append(ImageReference(
                    path=target,
                    line_number=content.count('\n', 0, start) + 1,
                    column=start - content.rfind('\n', 0, start),
                    context=hit.group(0),
                    ref_type='url' if is_url(target) else 'local',
                    original_text=hit.group(0),
                ))
        else:
            self._extract_from_json(document, content, refs)

        return refs

    def _extract_from_json(self, data, content: str, references: List[ImageReference], path: str = ""):
        """Depth-first walk recording every image-looking string value."""
        if isinstance(data, dict):
            for key, value in data.items():
                self._extract_from_json(value, content, references, f"{path}.{key}")
            return
        if isinstance(data, list):
            for index, item in enumerate(data):
                self._extract_from_json(item, content, references, f"{path}[{index}]")
            return
        if not (isinstance(data, str) and self._is_image_path(data)):
            return

        # Locate the quoted literal in the raw text to recover its
        # position; only the first occurrence is used.
        hit = re.search(f'["\']({re.escape(data)})["\']', content)
        if hit is None:
            return
        start = hit.start()
        references.append(ImageReference(
            path=data,
            line_number=content.count('\n', 0, start) + 1,
            column=start - content.rfind('\n', 0, start),
            context=hit.group(0),
            ref_type='url' if is_url(data) else 'local',
            original_text=hit.group(0),
        ))

    def _is_image_path(self, path: str) -> bool:
        """Return True when *path* ends with a common image extension."""
        return path.lower().endswith(
            ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico'))
|
576
|
+
|
577
|
+
|
578
|
+
class ParserRegistry:
    """Registry that dispatches files to the appropriate parser."""

    def __init__(self):
        self.parsers = [
            HTMLParser(),
            CSSParser(),
            JavaScriptParser(),
            MarkdownParser(),
            JSONParser(),
        ]

    def get_parser(self, file_path: Path) -> Optional[BaseParser]:
        """Return the first registered parser that accepts *file_path*."""
        return next((p for p in self.parsers if p.can_parse(file_path)), None)

    def parse_file(self, file_path: Path) -> List[ImageReference]:
        """Parse *file_path* with its matching parser; [] when unsupported."""
        parser = self.get_parser(file_path)
        return parser.parse(file_path) if parser else []

    def get_supported_extensions(self) -> Set[str]:
        """Return the union of every extension the parsers understand."""
        extensions = set()
        for parser in self.parsers:
            extensions.update(parser.supported_formats)
        return extensions
|