cloudflare-images-migrator 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/parsers.py ADDED
@@ -0,0 +1,609 @@
1
+ """
2
+ Image reference parsers for different file formats
3
+ """
4
+
5
+ import re
6
+ import json
7
+ from pathlib import Path
8
+ from typing import List, Dict, Set, Tuple, Optional
9
+ from bs4 import BeautifulSoup
10
+ from urllib.parse import urljoin, urlparse
11
+
12
+ from .utils import is_url, normalize_path, safe_read_file
13
+
14
+
15
class ImageReference:
    """A single image reference discovered while scanning a file.

    Instances are plain records: the referenced path or URL, where it was
    found, the text around it, and how the caller classified it.
    """

    def __init__(self, path: str, line_number: int, column: int,
                 context: str, ref_type: str, original_text: str):
        # What was referenced (image path or URL) and its classification
        # ('local', 'url', 'data').
        self.path = path
        self.ref_type = ref_type
        # Where the reference was found.
        self.line_number = line_number
        self.column = column
        # Surrounding text for replacement, and the original reference text.
        self.context = context
        self.original_text = original_text
        # Derived flag, computed by the shared is_url() helper.
        self.is_url = is_url(path)

    def __repr__(self):
        # The debugging form is the same as the display form.
        return str(self)

    def __str__(self):
        return "ImageRef({}, line {}, {})".format(
            self.path, self.line_number, self.ref_type)
33
+
34
+
35
class BaseParser:
    """Common interface shared by every image-reference parser."""

    def __init__(self, supported_formats: List[str]):
        # Extensions are stored lower-cased so that suffix matching in
        # can_parse() is case-insensitive.
        self.supported_formats = [extension.lower() for extension in supported_formats]

    def can_parse(self, file_path: Path) -> bool:
        """Return True when the file's suffix is one this parser handles."""
        suffix = file_path.suffix.lower()
        return suffix in self.supported_formats

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Extract image references from a file; subclasses must override."""
        raise NotImplementedError
48
+
49
+
50
class HTMLParser(BaseParser):
    """Parser for HTML and similar markup files.

    References are collected from ``<img src>`` attributes, ``url(...)``
    targets in inline ``style`` attributes and ``url(...)`` targets inside
    ``<style>`` blocks.  If BeautifulSoup processing raises, a regex scan
    for ``<img>`` tags is used instead.
    """

    # url(...) targets inside CSS text; group 1 is the unquoted target.
    _CSS_URL_RE = re.compile(
        r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', re.IGNORECASE)
    # <img ... src="..."> for the regex fallback; group 1 is the src value.
    _IMG_TAG_RE = re.compile(
        r'<img[^>]+src\s*=\s*["\']([^"\']+)["\'][^>]*>', re.IGNORECASE)

    # Traditional image extensions.
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
                         '.bmp', '.ico', '.avif', '.heic')
    # Extensions that are never images.  They veto the fuzzy keyword
    # heuristics so fonts, scripts and stylesheets are not misreported.
    _NON_IMAGE_EXTENSIONS = ('.woff', '.woff2', '.ttf', '.otf', '.eot',
                             '.css', '.js', '.mjs', '.map', '.json')
    # Common image hosting domains and services (substring match on netloc).
    _IMAGE_DOMAINS = (
        'img.shields.io',          # Badge service
        'github.com',              # GitHub assets
        'githubusercontent.com',   # GitHub raw content
        'imagedelivery.net',       # Cloudflare Images
        'imgur.com',               # Imgur
        'gravatar.com',            # Gravatar
        'unsplash.com',            # Unsplash
        'pexels.com',              # Pexels
        'pixabay.com',             # Pixabay
        'jsdelivr.net',            # CDN images
        'cloudfront.net',          # AWS CloudFront
        'fastly.com',              # Fastly CDN
        'shopify.com',             # Shopify assets
        'squarespace.com',         # Squarespace assets
        'wix.com',                 # Wix assets
        'wordpress.com',           # WordPress assets
        'medium.com',              # Medium images
        'assets',                  # Generic assets host name fragment
    )
    # Badge services (img.shields.io, badgen.net, etc.).
    _BADGE_INDICATORS = ('badge', 'shield', 'logo=', 'style=', 'color=')
    # Image-like path fragments.
    _IMAGE_KEYWORDS = ('/images/', '/img/', '/pics/', '/photos/', '/assets/',
                       '/media/', '/uploads/', '/content/', '/static/',
                       '/public/', 'icon', 'logo', 'banner', 'avatar',
                       'thumbnail', 'cover', 'screenshot')
    # Query parameters that suggest an image transformation URL.
    _IMAGE_QUERY_PARAMS = ('width=', 'height=', 'size=', 'format=', 'quality=')

    def __init__(self):
        super().__init__(['.html', '.htm', '.xhtml', '.xml'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return every image reference found in an HTML document.

        Args:
            file_path: File to read when ``content`` is not supplied.
            content: Already-loaded document text, if available.

        Returns:
            A list of ImageReference objects; empty when the file cannot
            be read.
        """
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        try:
            soup = BeautifulSoup(content, 'html.parser')
            references = []

            # <img src="..."> tags.
            for img in soup.find_all('img'):
                src = img.get('src')
                if not src:
                    continue
                markup = str(img)
                references.append(ImageReference(
                    path=src,
                    line_number=self._tag_line(img, content, markup),
                    column=0,
                    context=markup,
                    ref_type='url' if is_url(src) else 'local',
                    original_text=markup,
                ))

            # CSS background images in style attributes.
            for element in soup.find_all(attrs={'style': True}):
                style = element.get('style', '')
                css_refs = self._parse_css_urls(style)
                if not css_refs:
                    continue
                markup = str(element)
                line_num = self._tag_line(element, content, markup)
                for ref in css_refs:
                    references.append(ImageReference(
                        path=ref,
                        line_number=line_num,
                        column=0,
                        context=markup,
                        ref_type='url' if is_url(ref) else 'local',
                        original_text=style,
                    ))

            # CSS inside <style> blocks.
            for style_tag in soup.find_all('style'):
                css = style_tag.string
                if not css:
                    continue
                # Offset of the CSS text in the document, so each url(...)
                # gets its own line number rather than the block's.
                css_start = content.find(css)
                for match in self._CSS_URL_RE.finditer(css):
                    ref = match.group(1)
                    if not self._is_image_url(ref):
                        continue
                    if css_start != -1:
                        line_num = content.count('\n', 0, css_start + match.start()) + 1
                    else:
                        line_num = self._tag_line(style_tag, content, css)
                    references.append(ImageReference(
                        path=ref,
                        line_number=line_num,
                        column=0,
                        context=css,
                        ref_type='url' if is_url(ref) else 'local',
                        original_text=css,
                    ))

        except Exception:
            # Best-effort: any BeautifulSoup failure falls back to the
            # regex scan, discarding partial results.
            return self._regex_parse(content)

        return references

    def _tag_line(self, tag, content: str, markup: str) -> int:
        """Return the 1-based source line of ``tag``.

        Uses the position recorded by BeautifulSoup when available.
        Re-serialized markup often differs from the source text (quoting,
        attribute order, self-closing form), so the text search is only a
        fallback for parser versions that do not record positions.
        """
        line = getattr(tag, 'sourceline', None)
        if isinstance(line, int) and line > 0:
            return line
        return self._find_line_number(content, markup)

    def _parse_css_urls(self, css_content: str) -> List[str]:
        """Extract image-looking url(...) targets from CSS content."""
        return [target for target in self._CSS_URL_RE.findall(css_content)
                if self._is_image_url(target)]

    def _is_image_url(self, url: str) -> bool:
        """Heuristically decide whether ``url`` refers to an image.

        Accepted, in order: ``data:image`` URLs, known image extensions
        (checked on the whole string and on the path without query or
        fragment), known hosting domains, GitHub asset URLs and badge
        indicators.  After that, a known non-image extension (fonts,
        scripts, stylesheets) rejects the URL; otherwise the ``svg``
        substring, image-like path keywords and image-like query
        parameters are tried.
        """
        if not url or not url.strip():
            return False

        url_lower = url.lower().strip()

        if url_lower.startswith('data:image'):
            return True

        try:
            parsed = urlparse(url_lower)
            netloc, resource_path = parsed.netloc, parsed.path
        except ValueError:
            # Malformed authority (e.g. an unbalanced "["): treat the value
            # as a plain path instead of failing.
            netloc = ''
            resource_path = re.split(r'[?#]', url_lower, maxsplit=1)[0]

        # Checking the path as well keeps "logo.png?v=3" recognised.
        if (url_lower.endswith(self._IMAGE_EXTENSIONS)
                or resource_path.endswith(self._IMAGE_EXTENSIONS)):
            return True

        if netloc and any(domain in netloc for domain in self._IMAGE_DOMAINS):
            return True

        # GitHub asset patterns.
        if 'githubusercontent.com' in url_lower:
            return True
        if 'github.com' in url_lower and '/assets/' in url_lower:
            return True

        if any(indicator in url_lower for indicator in self._BADGE_INDICATORS):
            return True

        # Everything below is a loose substring heuristic; do not let it
        # claim resources that are clearly not images.
        if resource_path.endswith(self._NON_IMAGE_EXTENSIONS):
            return False

        # SVG files in URLs (often don't have .svg extension).
        if 'svg' in url_lower:
            return True

        if any(keyword in url_lower for keyword in self._IMAGE_KEYWORDS):
            return True

        return '?' in url and any(
            param in url_lower for param in self._IMAGE_QUERY_PARAMS)

    def _find_line_number(self, content: str, search_text: str) -> int:
        """Return the 1-based line of the first occurrence of
        ``search_text`` in ``content``, or 1 when it cannot be located."""
        try:
            index = content.find(search_text)
        except (TypeError, AttributeError):
            return 1
        if index == -1:
            return 1
        return content.count('\n', 0, index) + 1

    def _regex_parse(self, content: str) -> List[ImageReference]:
        """Fallback regex parsing: scan for <img> tags with a quoted src."""
        references = []

        for match in self._IMG_TAG_RE.finditer(content):
            src = match.group(1)
            if not self._is_image_url(src):
                continue
            start = match.start()
            references.append(ImageReference(
                path=src,
                line_number=content.count('\n', 0, start) + 1,
                # Column within the line, consistent with the other
                # parsers, not the absolute offset in the document.
                column=start - content.rfind('\n', 0, start),
                context=match.group(0),
                ref_type='url' if is_url(src) else 'local',
                original_text=match.group(0),
            ))

        return references
226
+
227
+
228
class CSSParser(BaseParser):
    """Parser for CSS files (and SCSS/Sass/Less sources).

    Every ``url(...)`` target that looks like an image is reported.
    """

    # url(...) targets; group 1 is the unquoted target.
    _URL_RE = re.compile(
        r'url\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', re.IGNORECASE)

    # Traditional image extensions.
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
                         '.bmp', '.ico', '.avif', '.heic')
    # Extensions that are never images.  They veto the fuzzy keyword
    # heuristics so @font-face sources and @import targets are not
    # misreported (e.g. "fonts/icon-font.woff2" contains "icon").
    _NON_IMAGE_EXTENSIONS = ('.woff', '.woff2', '.ttf', '.otf', '.eot',
                             '.css', '.js', '.mjs', '.map', '.json')
    # Common image hosting domains and services (substring match on netloc).
    _IMAGE_DOMAINS = (
        'img.shields.io',          # Badge service
        'github.com',              # GitHub assets
        'githubusercontent.com',   # GitHub raw content
        'imagedelivery.net',       # Cloudflare Images
        'imgur.com',               # Imgur
        'gravatar.com',            # Gravatar
        'unsplash.com',            # Unsplash
        'pexels.com',              # Pexels
        'pixabay.com',             # Pixabay
        'jsdelivr.net',            # CDN images
        'cloudfront.net',          # AWS CloudFront
        'fastly.com',              # Fastly CDN
        'shopify.com',             # Shopify assets
        'squarespace.com',         # Squarespace assets
        'wix.com',                 # Wix assets
        'wordpress.com',           # WordPress assets
        'medium.com',              # Medium images
        'assets',                  # Generic assets host name fragment
    )
    # Badge services (img.shields.io, badgen.net, etc.).
    _BADGE_INDICATORS = ('badge', 'shield', 'logo=', 'style=', 'color=')
    # Image-like path fragments.
    _IMAGE_KEYWORDS = ('/images/', '/img/', '/pics/', '/photos/', '/assets/',
                       '/media/', '/uploads/', '/content/', '/static/',
                       '/public/', 'icon', 'logo', 'banner', 'avatar',
                       'thumbnail', 'cover', 'screenshot')
    # Query parameters that suggest an image transformation URL.
    _IMAGE_QUERY_PARAMS = ('width=', 'height=', 'size=', 'format=', 'quality=')

    def __init__(self):
        super().__init__(['.css', '.scss', '.sass', '.less'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return the image references found in a stylesheet.

        Args:
            file_path: File to read when ``content`` is not supplied.
            content: Already-loaded stylesheet text, if available.

        Returns:
            A list of ImageReference objects; empty when the file cannot
            be read.
        """
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []
        for match in self._URL_RE.finditer(content):
            url = match.group(1)
            if not self._is_image_url(url):
                continue
            start = match.start()
            references.append(ImageReference(
                path=url,
                line_number=content.count('\n', 0, start) + 1,
                # Offset from the preceding newline, i.e. 1-based column.
                column=start - content.rfind('\n', 0, start),
                context=match.group(0),
                ref_type='url' if is_url(url) else 'local',
                original_text=match.group(0),
            ))

        return references

    def _is_image_url(self, url: str) -> bool:
        """Heuristically decide whether ``url`` refers to an image.

        Accepted, in order: ``data:image`` URLs, known image extensions
        (checked on the whole string and on the path without query or
        fragment), known hosting domains, GitHub asset URLs and badge
        indicators.  After that, a known non-image extension (fonts,
        scripts, stylesheets) rejects the URL; otherwise the ``svg``
        substring, image-like path keywords and image-like query
        parameters are tried.
        """
        if not url or not url.strip():
            return False

        url_lower = url.lower().strip()

        if url_lower.startswith('data:image'):
            return True

        try:
            parsed = urlparse(url_lower)
            netloc, resource_path = parsed.netloc, parsed.path
        except ValueError:
            # Malformed authority (e.g. an unbalanced "["): treat the value
            # as a plain path instead of failing.
            netloc = ''
            resource_path = re.split(r'[?#]', url_lower, maxsplit=1)[0]

        # Checking the path as well keeps "sprite.png?v=3" recognised.
        if (url_lower.endswith(self._IMAGE_EXTENSIONS)
                or resource_path.endswith(self._IMAGE_EXTENSIONS)):
            return True

        if netloc and any(domain in netloc for domain in self._IMAGE_DOMAINS):
            return True

        # GitHub asset patterns.
        if 'githubusercontent.com' in url_lower:
            return True
        if 'github.com' in url_lower and '/assets/' in url_lower:
            return True

        if any(indicator in url_lower for indicator in self._BADGE_INDICATORS):
            return True

        # Everything below is a loose substring heuristic; do not let it
        # claim resources that are clearly not images.
        if resource_path.endswith(self._NON_IMAGE_EXTENSIONS):
            return False

        # SVG files in URLs (often don't have .svg extension).
        if 'svg' in url_lower:
            return True

        if any(keyword in url_lower for keyword in self._IMAGE_KEYWORDS):
            return True

        return '?' in url and any(
            param in url_lower for param in self._IMAGE_QUERY_PARAMS)
332
+
333
+
334
class JavaScriptParser(BaseParser):
    """Parser for JavaScript/TypeScript files.

    Looks for image paths in import statements, require() calls, plain
    string literals and JSX ``<img>`` / ``<Image>`` ``src`` expressions.
    """

    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
                         '.bmp', '.ico')

    # Ordered from most to least specific: when several patterns capture
    # the same literal, the first one supplies the reported context.
    # Group 1 is always the candidate path.
    _PATTERNS = tuple(re.compile(source, re.IGNORECASE) for source in (
        # Import statements
        r'import\s+.*?from\s+["\']([^"\']+\.(png|jpg|jpeg|gif|webp|svg))["\']',
        # Require statements
        r'require\s*\(\s*["\']([^"\']+\.(png|jpg|jpeg|gif|webp|svg))["\']',
        # String literals with image extensions
        r'["\']([^"\']*\.(png|jpg|jpeg|gif|webp|svg))["\']',
        # JSX img src
        r'<img[^>]+src\s*=\s*[{"\'][^}"\']*["\']([^"\']+)["\']',
        # React Image component
        r'<Image[^>]+src\s*=\s*[{"\'][^}"\']*["\']([^"\']+)["\']',
    ))

    def __init__(self):
        super().__init__(['.js', '.jsx', '.ts', '.tsx', '.mjs'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return the image references found in a script.

        The patterns overlap: the literal in ``import x from './a.png'``
        is matched by both the import pattern and the generic
        string-literal pattern.  Each literal is therefore reported only
        once, keyed by the position of the captured path.

        Args:
            file_path: File to read when ``content`` is not supplied.
            content: Already-loaded source text, if available.

        Returns:
            A list of ImageReference objects; empty when the file cannot
            be read.
        """
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []
        reported_spans = set()

        for pattern in self._PATTERNS:
            for match in pattern.finditer(content):
                img_path = match.group(1)
                if not self._is_image_path(img_path):
                    continue

                path_span = match.span(1)
                if path_span in reported_spans:
                    # Same literal already reported by an earlier pattern.
                    continue
                reported_spans.add(path_span)

                start = match.start()
                references.append(ImageReference(
                    path=img_path,
                    line_number=content.count('\n', 0, start) + 1,
                    # Offset from the preceding newline, i.e. 1-based column.
                    column=start - content.rfind('\n', 0, start),
                    context=match.group(0),
                    ref_type='url' if is_url(img_path) else 'local',
                    original_text=match.group(0),
                ))

        return references

    def _is_image_path(self, path: str) -> bool:
        """Check if path ends with a known image extension (case-insensitive)."""
        return path.lower().endswith(self._IMAGE_EXTENSIONS)
385
+
386
+
387
class MarkdownParser(BaseParser):
    """Parser for Markdown files.

    Handles inline images (``![alt](path "title")``), reference-style
    images (``![alt][label]`` with a ``[label]: path`` definition), and
    raw HTML ``<img>`` tags or bare ``src="..."`` attributes.
    """

    # ![alt](destination) -- group 2 is the raw destination, possibly
    # followed by a quoted title.
    _INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    # ![alt][label] -- group 2 is the label, not a path.
    _REFERENCE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\[([^\]]+)\]')
    # [label]: destination -- group 1 is the definition text, group 2 the
    # label, group 3 the destination (optionally wrapped in <...>).
    _REFERENCE_DEF_RE = re.compile(
        r'^[ ]{0,3}(\[([^\]]+)\]:[ \t]*<?([^\s>]+)>?)', re.MULTILINE)
    # HTML img tags in markdown; group 1 is the src value.
    _IMG_TAG_RE = re.compile(
        r'<img[^>]+src\s*=\s*["\']([^"\']+)["\'][^>]*>', re.IGNORECASE)
    # src= without an img tag (common in HTML snippets).
    _SRC_ATTR_RE = re.compile(r'src\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
    # Trailing quoted title inside an inline destination.
    _TITLE_RE = re.compile(r'\s+(?:"[^"]*"|\'[^\']*\')\s*$')

    # Traditional image extensions.
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
                         '.bmp', '.ico', '.avif', '.heic')
    # Extensions that are never images.  They veto the fuzzy keyword
    # heuristics so e.g. <script src="/static/app.js"> is not reported.
    _NON_IMAGE_EXTENSIONS = ('.woff', '.woff2', '.ttf', '.otf', '.eot',
                             '.css', '.js', '.mjs', '.map', '.json')
    # Common image hosting domains and services (substring match on netloc).
    _IMAGE_DOMAINS = (
        'img.shields.io',          # Badge service
        'github.com',              # GitHub assets
        'githubusercontent.com',   # GitHub raw content
        'imagedelivery.net',       # Cloudflare Images
        'imgur.com',               # Imgur
        'gravatar.com',            # Gravatar
        'unsplash.com',            # Unsplash
        'pexels.com',              # Pexels
        'pixabay.com',             # Pixabay
        'jsdelivr.net',            # CDN images
        'cloudfront.net',          # AWS CloudFront
        'fastly.com',              # Fastly CDN
        'shopify.com',             # Shopify assets
        'squarespace.com',         # Squarespace assets
        'wix.com',                 # Wix assets
        'wordpress.com',           # WordPress assets
        'medium.com',              # Medium images
        'assets',                  # Generic assets host name fragment
    )
    # Badge services (img.shields.io, badgen.net, etc.).
    _BADGE_INDICATORS = ('badge', 'shield', 'logo=', 'style=', 'color=')
    # Image-like path fragments.
    _IMAGE_KEYWORDS = ('/images/', '/img/', '/pics/', '/photos/', '/assets/',
                       '/media/', '/uploads/', '/content/', '/static/',
                       '/public/', 'icon', 'logo', 'banner', 'avatar',
                       'thumbnail', 'cover', 'screenshot')
    # Query parameters that suggest an image transformation URL.
    _IMAGE_QUERY_PARAMS = ('width=', 'height=', 'size=', 'format=', 'quality=')

    def __init__(self):
        super().__init__(['.md', '.markdown', '.mdown', '.mkd'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return the image references found in a Markdown document.

        Reference-style images are reported at their ``[label]: path``
        definition, because that is where the path text actually appears.
        A usage whose label has no definition is skipped.

        Args:
            file_path: File to read when ``content`` is not supplied.
            content: Already-loaded document text, if available.

        Returns:
            A list of ImageReference objects; empty when the file cannot
            be read.
        """
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []

        # Inline images: ![alt](image.jpg) or ![alt](any-url "title").
        for match in self._INLINE_IMAGE_RE.finditer(content):
            img_path = self._clean_destination(match.group(2))
            if self._is_image_path(img_path):
                references.append(self._make_reference(
                    content, match.start(), img_path, match.group(0)))

        # Reference-style images: ![alt][ref] with [ref]: image.jpg.
        used_labels = {
            self._normalize_label(match.group(2))
            for match in self._REFERENCE_IMAGE_RE.finditer(content)
        }
        if used_labels:
            for match in self._REFERENCE_DEF_RE.finditer(content):
                if self._normalize_label(match.group(2)) not in used_labels:
                    continue
                img_path = match.group(3)
                if self._is_image_path(img_path):
                    references.append(self._make_reference(
                        content, match.start(1), img_path, match.group(1)))

        # Raw HTML.  The bare src= pattern also matches the attribute
        # inside every <img> tag, so skip values already reported.
        reported_spans = set()
        for pattern in (self._IMG_TAG_RE, self._SRC_ATTR_RE):
            for match in pattern.finditer(content):
                path_span = match.span(1)
                if path_span in reported_spans:
                    continue
                img_path = match.group(1)
                if self._is_image_path(img_path):
                    reported_spans.add(path_span)
                    references.append(self._make_reference(
                        content, match.start(), img_path, match.group(0)))

        return references

    def _make_reference(self, content: str, start: int, img_path: str,
                        text: str) -> ImageReference:
        """Build an ImageReference for ``text`` found at offset ``start``."""
        return ImageReference(
            path=img_path,
            line_number=content.count('\n', 0, start) + 1,
            # Offset from the preceding newline, i.e. 1-based column.
            column=start - content.rfind('\n', 0, start),
            context=text,
            ref_type='url' if is_url(img_path) else 'local',
            original_text=text,
        )

    def _clean_destination(self, raw: str) -> str:
        """Strip an optional quoted title and ``<...>`` wrapper from an
        inline image destination."""
        destination = self._TITLE_RE.sub('', raw.strip()).strip()
        if len(destination) >= 2 and destination[0] == '<' and destination[-1] == '>':
            destination = destination[1:-1]
        return destination

    @staticmethod
    def _normalize_label(label: str) -> str:
        """Normalize a reference label: case-insensitive, with whitespace
        runs collapsed."""
        return ' '.join(label.casefold().split())

    def _is_image_path(self, path: str) -> bool:
        """Heuristically decide whether ``path`` refers to an image.

        Accepted, in order: ``data:image`` URLs, known image extensions
        (checked on the whole string and on the path without query or
        fragment), known hosting domains, GitHub asset URLs and badge
        indicators.  After that, a known non-image extension (fonts,
        scripts, stylesheets) rejects the value; otherwise the ``svg``
        substring, image-like path keywords and image-like query
        parameters are tried.
        """
        if not path or not path.strip():
            return False

        path_lower = path.lower().strip()

        if path_lower.startswith('data:image'):
            return True

        try:
            parsed = urlparse(path_lower)
            netloc, resource_path = parsed.netloc, parsed.path
        except ValueError:
            # Malformed authority (e.g. an unbalanced "["): treat the value
            # as a plain path instead of failing.
            netloc = ''
            resource_path = re.split(r'[?#]', path_lower, maxsplit=1)[0]

        # Checking the path as well keeps "shot.png?raw=true" recognised.
        if (path_lower.endswith(self._IMAGE_EXTENSIONS)
                or resource_path.endswith(self._IMAGE_EXTENSIONS)):
            return True

        if netloc and any(domain in netloc for domain in self._IMAGE_DOMAINS):
            return True

        # GitHub asset patterns.
        if 'githubusercontent.com' in path_lower:
            return True
        if 'github.com' in path_lower and '/assets/' in path_lower:
            return True

        if any(indicator in path_lower for indicator in self._BADGE_INDICATORS):
            return True

        # Everything below is a loose substring heuristic; do not let it
        # claim resources that are clearly not images.
        if resource_path.endswith(self._NON_IMAGE_EXTENSIONS):
            return False

        # SVG files in URLs (often don't have .svg extension).
        if 'svg' in path_lower:
            return True

        if any(keyword in path_lower for keyword in self._IMAGE_KEYWORDS):
            return True

        return '?' in path and any(
            param in path_lower for param in self._IMAGE_QUERY_PARAMS)
510
+
511
+
512
class JSONParser(BaseParser):
    """Parser for JSON configuration files.

    String values that end in an image extension are reported.  Documents
    that fail to parse as JSON are scanned with a regex instead.
    """

    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
                         '.bmp', '.ico')
    # Quoted string ending in an image extension, for the regex fallback.
    # Covers the same extensions as _is_image_path so both code paths agree.
    _FALLBACK_RE = re.compile(
        r'["\']([^"\']*\.(png|jpg|jpeg|gif|webp|svg|bmp|ico))["\']',
        re.IGNORECASE)

    def __init__(self):
        super().__init__(['.json'])

    def parse(self, file_path: Path, content: str = None) -> List[ImageReference]:
        """Return the image references found in a JSON document.

        Args:
            file_path: File to read when ``content`` is not supplied.
            content: Already-loaded document text, if available.

        Returns:
            A list of ImageReference objects; empty when the file cannot
            be read.
        """
        if content is None:
            content = safe_read_file(file_path)
            if content is None:
                return []

        references = []

        try:
            # Parse JSON and look for image references.
            data = json.loads(content)
        except json.JSONDecodeError:
            # Fallback to regex parsing.
            for match in self._FALLBACK_RE.finditer(content):
                img_path = match.group(1)
                start = match.start()
                references.append(ImageReference(
                    path=img_path,
                    line_number=content.count('\n', 0, start) + 1,
                    column=start - content.rfind('\n', 0, start),
                    context=match.group(0),
                    ref_type='url' if is_url(img_path) else 'local',
                    original_text=match.group(0),
                ))
        else:
            self._extract_from_json(data, content, references)

        return references

    def _extract_from_json(self, data, content: str, references: List[ImageReference],
                           path: str = "", _cursor: Optional[Dict[str, int]] = None):
        """Recursively extract image references from JSON data.

        Args:
            data: Decoded JSON value (dict, list or scalar) to walk.
            content: Raw document text, used to locate each value.
            references: Output list; found references are appended to it.
            path: JSON path of ``data``; only threaded through recursion.
            _cursor: Internal map from string value to the text offset
                just past its last reported occurrence.  It lets the n-th
                repeat of a value map to its n-th occurrence in the text
                instead of always the first.
        """
        if _cursor is None:
            _cursor = {}

        if isinstance(data, dict):
            for key, value in data.items():
                self._extract_from_json(
                    value, content, references, f"{path}.{key}", _cursor)
        elif isinstance(data, list):
            for index, item in enumerate(data):
                self._extract_from_json(
                    item, content, references, f"{path}[{index}]", _cursor)
        elif isinstance(data, str) and self._is_image_path(data):
            located = self._locate_string(data, content, _cursor.get(data, 0))
            if located is None:
                # The literal could not be found in the text; there is
                # nothing a caller could replace, so report nothing.
                return
            start, token = located
            _cursor[data] = start + len(token)
            references.append(ImageReference(
                path=data,
                line_number=content.count('\n', 0, start) + 1,
                # Offset from the preceding newline, i.e. 1-based column.
                column=start - content.rfind('\n', 0, start),
                context=token,
                ref_type='url' if is_url(data) else 'local',
                original_text=token,
            ))

    def _locate_string(self, value: str, content: str, start: int):
        """Find the quoted literal for ``value`` at or after ``start``.

        The decoded value may be written in the file with JSON escapes
        (``\\/``, ``\\uXXXX``), so several encodings are tried.  Returns
        ``(offset, literal_text)`` for the earliest hit, or None.  If
        nothing is found after ``start`` the search restarts from the
        top, so a value is never lost because of cursor drift.
        """
        unicode_form = json.dumps(value, ensure_ascii=False)
        ascii_form = json.dumps(value)
        # dict.fromkeys removes duplicate spellings while keeping order.
        tokens = list(dict.fromkeys((
            '"' + value + '"',
            unicode_form,
            ascii_form,
            unicode_form.replace('/', '\\/'),
            ascii_form.replace('/', '\\/'),
        )))

        offsets = (start, 0) if start else (0,)
        for offset in offsets:
            hits = []
            for token in tokens:
                index = content.find(token, offset)
                if index != -1:
                    hits.append((index, token))
            if hits:
                return min(hits)
        return None

    def _is_image_path(self, path: str) -> bool:
        """Check if path ends with a known image extension (case-insensitive)."""
        return path.lower().endswith(self._IMAGE_EXTENSIONS)
576
+
577
+
578
class ParserRegistry:
    """Holds one instance of every parser and dispatches files to them."""

    def __init__(self):
        # Order matters: get_parser() returns the first parser that matches.
        parser_types = (
            HTMLParser,
            CSSParser,
            JavaScriptParser,
            MarkdownParser,
            JSONParser,
        )
        self.parsers = [parser_type() for parser_type in parser_types]

    def get_parser(self, file_path: Path) -> Optional[BaseParser]:
        """Return the first registered parser that accepts the file, or None."""
        matching = (candidate for candidate in self.parsers
                    if candidate.can_parse(file_path))
        return next(matching, None)

    def parse_file(self, file_path: Path) -> List[ImageReference]:
        """Parse the file with its matching parser; [] when none applies."""
        chosen = self.get_parser(file_path)
        if not chosen:
            return []
        return chosen.parse(file_path)

    def get_supported_extensions(self) -> Set[str]:
        """Return the union of extensions handled by all registered parsers."""
        return {
            extension
            for registered in self.parsers
            for extension in registered.supported_formats
        }