klydo-mcp 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,759 @@
1
+ """
2
+ Myntra scraper implementation.
3
+
4
+ Myntra is one of India's largest fashion e-commerce platforms.
5
+ This scraper uses their internal API endpoints to fetch product data.
6
+
7
+ Note: This is for educational/personal use. Respect robots.txt
8
+ and rate limits when using this scraper.
9
+ """
10
+
11
+ import json
12
+ import re
13
+ from decimal import Decimal
14
+ from urllib.parse import quote_plus
15
+
16
+ import httpx
17
+ from selectolax.parser import HTMLParser
18
+
19
+ from klydo.config import settings
20
+ from klydo.models.product import Price, Product, ProductImage, ProductSummary
21
+ from klydo.scrapers.cache import Cache
22
+
23
+
24
+ def _extract_json_object(text: str, start_marker: str) -> dict | None:
25
+ """
26
+ Extract a complete JSON object from text starting at marker.
27
+
28
+ Uses brace counting to find matching closing brace,
29
+ handling nested objects and strings correctly.
30
+
31
+ Args:
32
+ text: Full text to search
33
+ start_marker: String that precedes the JSON object
34
+
35
+ Returns:
36
+ Parsed JSON dict, or None if extraction fails
37
+ """
38
+ start = text.find(start_marker)
39
+ if start == -1:
40
+ return None
41
+
42
+ # Find the start of the JSON object
43
+ json_start = text.find("{", start)
44
+ if json_start == -1:
45
+ return None
46
+
47
+ # Count braces to find the matching closing brace
48
+ depth = 0
49
+ in_string = False
50
+ escape = False
51
+
52
+ for i, char in enumerate(text[json_start:]):
53
+ if escape:
54
+ escape = False
55
+ continue
56
+ if char == "\\" and in_string:
57
+ escape = True
58
+ continue
59
+ if char == '"' and not escape:
60
+ in_string = not in_string
61
+ continue
62
+ if in_string:
63
+ continue
64
+ if char == "{":
65
+ depth += 1
66
+ elif char == "}":
67
+ depth -= 1
68
+ if depth == 0:
69
+ json_str = text[json_start : json_start + i + 1]
70
+ try:
71
+ return json.loads(json_str)
72
+ except json.JSONDecodeError:
73
+ return None
74
+
75
+ return None
76
+
77
+
78
class MyntraScraper:
    """
    Scraper for myntra.com.

    Uses Myntra's search/listing pages and product pages to fetch fashion
    product data. Each page is first searched for an embedded
    ``window.__myx`` JSON payload; when that is missing or unusable the
    HTML markup is parsed instead.

    The scraper owns an ``httpx.AsyncClient``; call ``close()`` when done,
    or use the instance with ``async with``.

    Attributes:
        source_name: Returns 'Myntra'
    """

    BASE_URL = "https://www.myntra.com"

    # Prefix applied to asset paths that are not already absolute URLs.
    IMAGE_CDN_PREFIX = "https://assets.myntassets.com/h_720,q_90,w_540/"

    def __init__(self) -> None:
        """Initialize scraper with HTTP client and cache."""
        self._client = httpx.AsyncClient(
            headers=self._get_headers(),
            timeout=settings.request_timeout,
            follow_redirects=True,
        )
        self._cache = Cache(namespace="myntra", default_ttl=settings.cache_ttl)

    async def __aenter__(self) -> "MyntraScraper":
        """Return the scraper itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP client when the ``async with`` block exits."""
        await self.close()

    @property
    def source_name(self) -> str:
        """Human-readable source name."""
        return "Myntra"

    def _get_headers(self) -> dict[str, str]:
        """
        Get browser-like headers to avoid blocks.

        Returns:
            Headers dict for HTTP requests
        """
        return {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            # NOTE(review): advertising "br" presumably requires a brotli
            # decoder to be installed alongside httpx - confirm, otherwise
            # brotli-compressed responses may not be decoded.
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    # ------------------------------------------------------------------
    # Small pure helpers shared by the parsers
    # ------------------------------------------------------------------

    @staticmethod
    def _slugify(value: str) -> str:
        """Lower-case a value, join words with hyphens and URL-escape it."""
        return quote_plus(value.lower().replace(" ", "-"))

    @classmethod
    def _absolute_image_url(cls, path) -> str:
        """Return an absolute image URL, or "" when no path is given."""
        if not path:
            return ""
        path = str(path)
        if path.startswith("http"):
            return path
        return f"{cls.IMAGE_CDN_PREFIX}{path}"

    @staticmethod
    def _parse_price_text(text: str) -> Decimal | None:
        """Extract the first number from price text (e.g. "Rs. 1,299" -> 1299)."""
        # The match must start with a digit so that stray punctuation such
        # as "Rs, 499" cannot produce a comma-only match.
        match = re.search(r"\d[\d,]*", text or "")
        if not match:
            return None
        return Decimal(match.group().replace(",", ""))

    @staticmethod
    def _discount_percent(
        original: Decimal | None,
        current: Decimal,
    ) -> int | None:
        """Compute the whole-number discount percentage from two prices."""
        if original and original > current:
            return int(((original - current) / original) * 100)
        return None

    @staticmethod
    def _category_name(value) -> str:
        """Normalize a category that may be a dict with ``typeName`` or a string."""
        if isinstance(value, dict):
            return value.get("typeName", "Fashion")
        return str(value) if value else "Fashion"

    @staticmethod
    def _product_id_from_href(href: str) -> str:
        """
        Derive a product ID from a product link.

        Query string and fragment are ignored, as are empty path segments
        (so a trailing slash does not yield an empty ID). The last
        all-digit segment is preferred; otherwise the last segment is used.
        """
        # NOTE(review): product links presumably look like
        # ".../<numeric id>/buy"; preferring the numeric segment keeps the
        # ID usable with get_product() - confirm against live markup.
        path = (href or "").split("?", 1)[0].split("#", 1)[0]
        segments = [segment for segment in path.split("/") if segment]
        if not segments:
            return ""
        for segment in reversed(segments):
            if segment.isdigit():
                return segment
        return segments[-1]

    @staticmethod
    def _collect(candidates, parse, limit: int | None) -> list:
        """Parse candidates in order, keeping successes until ``limit`` is reached."""
        collected = []
        for candidate in candidates:
            if limit is not None and len(collected) >= limit:
                break
            parsed = parse(candidate)
            if parsed:
                collected.append(parsed)
        return collected

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    async def search(
        self,
        query: str,
        *,
        category: str | None = None,
        gender: str | None = None,
        min_price: int | None = None,
        max_price: int | None = None,
        limit: int = 20,
    ) -> list[ProductSummary]:
        """
        Search Myntra products.

        Every product found on the result page is parsed, price-filtered
        and cached; ``limit`` is applied only to the returned list. The
        cache key does not include ``limit``, so the cached entry must be
        the full list for later calls with a larger limit to be correct.

        Args:
            query: Search terms
            category: Optional category filter
            gender: Optional gender filter (men/women)
            min_price: Minimum price in INR
            max_price: Maximum price in INR
            limit: Max results to return

        Returns:
            List of matching products; empty on HTTP errors
        """
        cache_key = self._cache.cache_key(
            "search",
            query,
            category or "",
            gender or "",
            # "is None" keeps a 0 bound distinct from "no bound".
            "" if min_price is None else str(min_price),
            "" if max_price is None else str(max_price),
        )

        cached = await self._cache.get(cache_key)
        if cached:
            products = [ProductSummary.model_validate(p) for p in cached]
            return products[:limit]

        search_path = self._build_search_path(query, gender, category)
        url = f"{self.BASE_URL}/{search_path}"

        try:
            response = await self._client.get(url)
            response.raise_for_status()

            # Parse everything first: truncating before filtering would
            # drop matches that sit further down the page.
            products = self._parse_search_results(response.text)

            if min_price is not None:
                products = [p for p in products if p.price.current >= min_price]
            if max_price is not None:
                products = [p for p in products if p.price.current <= max_price]

            await self._cache.set(
                cache_key,
                [p.model_dump(mode="json") for p in products],
            )

            return products[:limit]

        except httpx.HTTPError as e:
            # Log error in debug mode
            if settings.debug:
                print(f"Search error: {e}")
            return []

    def _build_search_path(
        self,
        query: str,
        gender: str | None = None,
        category: str | None = None,
    ) -> str:
        """
        Build Myntra search URL path.

        Gender, category and query are slugified (lower-cased, spaces to
        hyphens, URL-escaped) and joined with hyphens, in that order.

        Args:
            query: Search query
            gender: Gender filter
            category: Category filter

        Returns:
            URL path for search
        """
        # Myntra uses URL-based filtering
        segments = [self._slugify(part) for part in (gender, category) if part]
        segments.append(self._slugify(query))
        return "-".join(segments)

    def _parse_search_results(
        self,
        html: str,
        limit: int | None = None,
    ) -> list[ProductSummary]:
        """
        Parse search results from HTML.

        The embedded ``window.__myx`` JSON is tried first
        (``searchData.results.products``); if it yields no products the
        ``li.product-base`` elements are parsed instead.

        Args:
            html: Raw HTML response
            limit: Max products to return; None returns every product found

        Returns:
            List of parsed products
        """
        data = _extract_json_object(html, "window.__myx")

        if data:
            try:
                # "or" also covers keys that are present but null.
                search_data = data.get("searchData") or {}
                results = search_data.get("results") or {}
                product_list = results.get("products") or []

                products = self._collect(
                    product_list, self._parse_product_from_json, limit
                )
                if products:
                    return products

            except (AttributeError, KeyError, TypeError):
                # Unexpected payload shape: fall through to HTML parsing.
                pass

        # Fallback: parse HTML directly
        parser = HTMLParser(html)
        return self._collect(
            parser.css("li.product-base"), self._parse_product_from_html, limit
        )

    def _parse_product_from_json(self, data: dict) -> ProductSummary | None:
        """
        Parse product from Myntra JSON data.

        Args:
            data: Product JSON from Myntra

        Returns:
            ProductSummary or None if the ID or image is missing or the
            entry cannot be parsed
        """
        try:
            raw_id = data.get("productId")
            if raw_id is None or raw_id == "":
                return None
            product_id = str(raw_id)

            # Price may be a plain number or a dict with mrp/discounted.
            price_data = data.get("price", data.get("mrp", 0))
            if isinstance(price_data, dict):
                current_price = Decimal(
                    str(price_data.get("discounted", price_data.get("mrp", 0)))
                )
                original_price = Decimal(str(price_data.get("mrp", current_price)))
            else:
                mrp = data.get("mrp", price_data)
                current_price = Decimal(str(price_data))
                original_price = Decimal(str(mrp)) if mrp != price_data else None

            # Calculate discount percent from prices (discount field is amount, not percent)
            discount_percent = self._discount_percent(original_price, current_price)

            # First listed image; entries may be dicts with "src" or bare paths.
            image_url = ""
            images = data.get("images") or []
            if images:
                first_image = images[0]
                source = (
                    first_image.get("src", "")
                    if isinstance(first_image, dict)
                    else first_image
                )
                image_url = self._absolute_image_url(source)

            # Fallback to search image
            if not image_url:
                image_url = self._absolute_image_url(data.get("searchImage", ""))

            if not image_url:
                return None

            # Category can be a string or a dict with typeName.
            category = self._category_name(
                data.get("articleType", data.get("category", "Fashion"))
            )

            return ProductSummary(
                id=product_id,
                name=data.get("productName", data.get("product", "Unknown")),
                brand=data.get("brand", "Unknown"),
                price=Price(
                    current=current_price,
                    original=original_price,
                    currency="INR",
                    discount_percent=discount_percent,
                ),
                image_url=image_url,
                category=category,
                source=self.source_name,
                url=f"{self.BASE_URL}/{product_id}",
            )

        # ArithmeticError covers decimal.InvalidOperation, which Decimal()
        # raises for non-numeric text and which is not a ValueError.
        except (
            ArithmeticError,
            AttributeError,
            KeyError,
            TypeError,
            ValueError,
        ) as e:
            if settings.debug:
                print(f"Parse error: {e}")
            return None

    def _parse_product_from_html(self, element) -> ProductSummary | None:
        """
        Parse product from HTML element (fallback).

        Args:
            element: selectolax Node

        Returns:
            ProductSummary or None if the link, ID or image is missing or
            parsing fails
        """
        try:
            link = element.css_first("a")
            if not link:
                return None

            href = link.attributes.get("href") or ""
            product_id = self._product_id_from_href(href)
            if not product_id:
                return None

            # An empty "src" must not hide a usable "data-src".
            image_url = ""
            img = element.css_first("img")
            if img:
                image_url = (
                    img.attributes.get("src")
                    or img.attributes.get("data-src")
                    or ""
                )
            if not image_url:
                return None

            brand_elem = element.css_first(".product-brand")
            brand = brand_elem.text().strip() if brand_elem else "Unknown"

            name_elem = element.css_first(".product-product")
            name = name_elem.text().strip() if name_elem else "Unknown Product"

            # Prefer the discounted price, falling back to the regular one.
            price_elem = element.css_first(
                ".product-discountedPrice"
            ) or element.css_first(".product-price")
            current_price = Decimal("0")
            if price_elem:
                current_price = (
                    self._parse_price_text(price_elem.text().strip())
                    or current_price
                )

            original_price = None
            original_elem = element.css_first(".product-strike")
            if original_elem:
                original_price = self._parse_price_text(original_elem.text().strip())

            discount_percent = None
            discount_elem = element.css_first(".product-discountPercentage")
            if discount_elem:
                discount_match = re.search(r"(\d+)%", discount_elem.text().strip())
                if discount_match:
                    discount_percent = int(discount_match.group(1))

            # Absolute links are used as-is instead of being glued to BASE_URL.
            if href.startswith("http"):
                product_url = href
            else:
                product_url = f"{self.BASE_URL}/{href.lstrip('/')}"

            return ProductSummary(
                id=product_id,
                name=name,
                brand=brand,
                price=Price(
                    current=current_price,
                    original=original_price,
                    currency="INR",
                    discount_percent=discount_percent,
                ),
                image_url=image_url,
                category="Fashion",
                source=self.source_name,
                url=product_url,
            )

        except (ArithmeticError, AttributeError, TypeError, ValueError) as e:
            if settings.debug:
                print(f"HTML parse error: {e}")
            return None

    # ------------------------------------------------------------------
    # Product details
    # ------------------------------------------------------------------

    async def get_product(self, product_id: str) -> Product | None:
        """
        Get full product details by ID.

        Args:
            product_id: Myntra product ID

        Returns:
            Full product details, or None if not found
        """
        cache_key = self._cache.cache_key("product", product_id)

        cached = await self._cache.get(cache_key)
        if cached:
            return Product.model_validate(cached)

        url = f"{self.BASE_URL}/{product_id}"

        try:
            response = await self._client.get(url)
            response.raise_for_status()

            product = self._parse_product_page(response.text, product_id)

            # Only successful parses are cached.
            if product:
                await self._cache.set(cache_key, product.model_dump(mode="json"))

            return product

        except httpx.HTTPError as e:
            if settings.debug:
                print(f"Product fetch error: {e}")
            return None

    def _parse_product_page(self, html: str, product_id: str) -> Product | None:
        """
        Parse full product details from product page.

        The ``pdpData`` section of the embedded ``window.__myx`` JSON is
        tried first. If it is absent or cannot be parsed, the page markup
        is used. When the markup contains none of the expected product
        elements, None is returned instead of a product built purely from
        placeholder values.

        Args:
            html: Raw HTML of product page
            product_id: Product ID

        Returns:
            Full Product or None if parsing fails
        """
        data = _extract_json_object(html, "window.__myx")

        if data:
            pdp_data = data.get("pdpData")
            if isinstance(pdp_data, dict) and pdp_data:
                product = self._parse_full_product_from_json(pdp_data, product_id)
                if product:
                    return product

        # Fallback: basic HTML parsing
        parser = HTMLParser(html)

        try:
            # NOTE(review): on Myntra's product markup "pdp-title" appears
            # to hold the brand and "pdp-name" the product name - confirm
            # these two selectors are not swapped.
            name_elem = parser.css_first("h1.pdp-title")
            brand_elem = parser.css_first("h1.pdp-name")

            name = name_elem.text().strip() if name_elem else "Unknown Product"
            brand = brand_elem.text().strip() if brand_elem else "Unknown"

            price_elem = parser.css_first(".pdp-price strong")
            current_price = Decimal("0")
            if price_elem:
                current_price = (
                    self._parse_price_text(price_elem.text().strip())
                    or current_price
                )

            # Gallery images are exposed as CSS background-image URLs.
            images = []
            for img in parser.css(".image-grid-image"):
                style = img.attributes.get("style") or ""
                url_match = re.search(r'url\(["\']?([^"\']+)["\']?\)', style)
                if url_match:
                    img_url = url_match.group(1)
                    if not img_url.startswith("http"):
                        # Treated as protocol-relative ("//host/path").
                        img_url = f"https:{img_url}"
                    images.append(ProductImage(url=img_url, alt=name))

            # Nothing recognizable on the page: report failure rather than
            # returning (and letting the caller cache) a placeholder product.
            if (
                name_elem is None
                and brand_elem is None
                and price_elem is None
                and not images
            ):
                return None

            desc_elem = parser.css_first(".pdp-productDescriptorsContainer")
            description = desc_elem.text().strip() if desc_elem else ""

            sizes = []
            for size_btn in parser.css(".size-buttons-buttonContainer button"):
                size_text = size_btn.text().strip()
                if size_text:
                    sizes.append(size_text)

            return Product(
                id=product_id,
                name=name,
                brand=brand,
                price=Price(current=current_price, currency="INR"),
                image_url=images[0].url
                if images
                else f"{self.BASE_URL}/placeholder.jpg",
                category="Fashion",
                source=self.source_name,
                url=f"{self.BASE_URL}/{product_id}",
                description=description,
                images=images,
                sizes=sizes,
                colors=[],
                rating=None,
                review_count=0,
                in_stock=True,
                specifications={},
            )

        except (ArithmeticError, AttributeError, TypeError, ValueError) as e:
            if settings.debug:
                print(f"Product page parse error: {e}")
            return None

    def _parse_full_product_from_json(
        self,
        data: dict,
        product_id: str,
    ) -> Product | None:
        """
        Parse full product from Myntra PDP JSON data.

        Sections that are missing or null are treated as empty. The
        discount percentage is derived from the MRP and discounted price,
        the same way the search-result parser does it, rather than from the
        payload's ``discount`` field.

        Args:
            data: PDP JSON data
            product_id: Product ID

        Returns:
            Full Product or None if no image is available or parsing fails
        """
        try:
            price_data = data.get("price") or {}
            mrp = price_data.get("mrp", 0)
            discounted = price_data.get("discounted", mrp)
            current_price = Decimal(str(discounted))
            original_price = Decimal(str(mrp)) if mrp != discounted else None
            discount_percent = self._discount_percent(original_price, current_price)

            # Images from every media album, in order.
            alt_text = data.get("name", "")
            images = []
            media = data.get("media") or {}
            for album in media.get("albums") or []:
                for img in album.get("images") or []:
                    img_url = self._absolute_image_url(img.get("imageURL", ""))
                    if img_url:
                        images.append(ProductImage(url=img_url, alt=alt_text))

            # Sizes not explicitly marked unavailable.
            sizes = []
            for size in data.get("sizes") or []:
                size_label = size.get("label", "")
                if size_label and size.get("available", True):
                    sizes.append(size_label)

            # Distinct colors from style options, first-seen order.
            colors = []
            for opt in data.get("styleOptions") or []:
                color = opt.get("color", "")
                if color and color not in colors:
                    colors.append(color)

            ratings = data.get("ratings") or {}
            rating = ratings.get("averageRating")
            review_count = ratings.get("totalCount", 0)

            specs = {}
            for detail in data.get("productDetails") or []:
                title = detail.get("title", "")
                desc = detail.get("description", "")
                if title and desc:
                    specs[title] = desc

            in_stock = data.get("isAvailable", True)

            # Fall back to the article attributes when there is no description.
            description = data.get("productDescription", "")
            if not description:
                attributes = data.get("articleAttributes") or {}
                description = ", ".join(
                    f"{k}: {v}" for k, v in attributes.items() if v
                )

            if images:
                primary_image = str(images[0].url)
            else:
                primary_image = self._absolute_image_url(data.get("searchImage"))

            if not primary_image:
                return None

            # Brand is a dict with "name" here; a plain string is accepted too.
            brand_data = data.get("brand")
            if isinstance(brand_data, dict):
                brand = brand_data.get("name", "Unknown")
            else:
                brand = str(brand_data) if brand_data else "Unknown"

            return Product(
                id=product_id,
                name=data.get("name", "Unknown Product"),
                brand=brand,
                price=Price(
                    current=current_price,
                    original=original_price,
                    currency="INR",
                    discount_percent=discount_percent,
                ),
                image_url=primary_image,
                category=self._category_name(data.get("articleType")),
                source=self.source_name,
                url=f"{self.BASE_URL}/{product_id}",
                description=description,
                images=images,
                sizes=sizes,
                colors=colors,
                rating=float(rating) if rating else None,
                review_count=review_count,
                in_stock=in_stock,
                specifications=specs,
            )

        # ArithmeticError covers decimal.InvalidOperation; AttributeError
        # covers sections whose type differs from the expected dict/list.
        except (
            ArithmeticError,
            AttributeError,
            KeyError,
            TypeError,
            ValueError,
        ) as e:
            if settings.debug:
                print(f"Full product parse error: {e}")
            return None

    # ------------------------------------------------------------------
    # Trending
    # ------------------------------------------------------------------

    async def get_trending(
        self,
        category: str | None = None,
        limit: int = 20,
    ) -> list[ProductSummary]:
        """
        Get trending/popular products.

        Requests the category listing (or ``clothing`` when no category is
        given) sorted by popularity. The full parsed list is cached and
        ``limit`` is applied only to the returned list, because the cache
        key does not include ``limit``.

        Args:
            category: Optional category filter
            limit: Max results to return

        Returns:
            List of trending products; empty on HTTP errors
        """
        cache_key = self._cache.cache_key("trending", category or "all")

        cached = await self._cache.get(cache_key)
        if cached:
            products = [ProductSummary.model_validate(p) for p in cached]
            return products[:limit]

        listing = self._slugify(category) if category else "clothing"
        url = f"{self.BASE_URL}/{listing}?sort=popularity"

        try:
            response = await self._client.get(url)
            response.raise_for_status()

            products = self._parse_search_results(response.text)

            await self._cache.set(
                cache_key,
                [p.model_dump(mode="json") for p in products],
            )

            return products[:limit]

        except httpx.HTTPError as e:
            if settings.debug:
                print(f"Trending fetch error: {e}")
            return []

    async def close(self) -> None:
        """Clean up HTTP client."""
        await self._client.aclose()