phoenix-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. phoenix/__init__.py +41 -0
  2. phoenix/__main__.py +8 -0
  3. phoenix/adapters/__init__.py +25 -0
  4. phoenix/adapters/base.py +230 -0
  5. phoenix/adapters/facebook.py +482 -0
  6. phoenix/adapters/generated/__init__.py +0 -0
  7. phoenix/adapters/generated/quotes_to_scrape.py +76 -0
  8. phoenix/adapters/generic.py +189 -0
  9. phoenix/adapters/instagram.py +426 -0
  10. phoenix/adapters/linkedin.py +542 -0
  11. phoenix/adapters/tiktok.py +557 -0
  12. phoenix/adapters/x_twitter.py +401 -0
  13. phoenix/adapters/youtube.py +544 -0
  14. phoenix/architect/__init__.py +15 -0
  15. phoenix/architect/coder.py +150 -0
  16. phoenix/architect/critic.py +324 -0
  17. phoenix/architect/explorer.py +232 -0
  18. phoenix/architect/fixture_generator.py +256 -0
  19. phoenix/architect/inspector.py +111 -0
  20. phoenix/architect/orchestrator.py +403 -0
  21. phoenix/architect/researcher.py +187 -0
  22. phoenix/architect/template_generator.py +145 -0
  23. phoenix/architect/writer.py +108 -0
  24. phoenix/cli/__init__.py +7 -0
  25. phoenix/cli/main.py +725 -0
  26. phoenix/collectors/__init__.py +17 -0
  27. phoenix/collectors/base.py +81 -0
  28. phoenix/collectors/browser.py +209 -0
  29. phoenix/collectors/browser_pool.py +197 -0
  30. phoenix/collectors/direct.py +132 -0
  31. phoenix/engine.py +257 -0
  32. phoenix/exceptions.py +77 -0
  33. phoenix/infrastructure/__init__.py +40 -0
  34. phoenix/infrastructure/audit_logger.py +68 -0
  35. phoenix/infrastructure/config.py +134 -0
  36. phoenix/infrastructure/license_manager.py +270 -0
  37. phoenix/infrastructure/rate_limiter.py +197 -0
  38. phoenix/infrastructure/session_manager.py +92 -0
  39. phoenix/infrastructure/storage.py +580 -0
  40. phoenix/infrastructure/vault.py +275 -0
  41. phoenix/intelligence/__init__.py +18 -0
  42. phoenix/intelligence/anti_bot_recovery.py +205 -0
  43. phoenix/intelligence/change_detector.py +314 -0
  44. phoenix/intelligence/classifier.py +179 -0
  45. phoenix/intelligence/entities.py +104 -0
  46. phoenix/intelligence/selector_health.py +139 -0
  47. phoenix/intelligence/selector_repair.py +35 -0
  48. phoenix/models/__init__.py +34 -0
  49. phoenix/models/classification.py +19 -0
  50. phoenix/models/config.py +207 -0
  51. phoenix/models/document.py +70 -0
  52. phoenix/models/output.py +182 -0
  53. phoenix/models/session.py +26 -0
  54. phoenix/models/strategy.py +28 -0
  55. phoenix/options.py +67 -0
  56. phoenix/pipeline.py +598 -0
  57. phoenix/plugins/__init__.py +9 -0
  58. phoenix/plugins/loader.py +266 -0
  59. phoenix/plugins/manifest.py +62 -0
  60. phoenix/plugins/registry.py +109 -0
  61. phoenix/processing/__init__.py +15 -0
  62. phoenix/processing/ai_assistant.py +101 -0
  63. phoenix/processing/archiver.py +124 -0
  64. phoenix/processing/domain_memory.py +304 -0
  65. phoenix/processing/html_extractor.py +79 -0
  66. phoenix/processing/normalizer.py +124 -0
  67. phoenix/processing/phoenix_ai_extractor.py +436 -0
  68. phoenix/py.typed +0 -0
  69. phoenix/router.py +304 -0
  70. phoenix/scrapers/__init__.py +33 -0
  71. phoenix/scrapers/base.py +13 -0
  72. phoenix/scrapers/browser.py +9 -0
  73. phoenix/scrapers/http.py +9 -0
  74. phoenix/scrapers/selector_engine.py +38 -0
  75. phoenix/stealth/__init__.py +21 -0
  76. phoenix/stealth/captcha.py +143 -0
  77. phoenix/stealth/humanizer.py +101 -0
  78. phoenix/stealth/profile.py +134 -0
  79. phoenix/stealth/rotator.py +87 -0
  80. phoenix/stealth/warming.py +56 -0
  81. phoenix/strategy_selector.py +145 -0
  82. phoenix/version.py +7 -0
  83. phoenix_engine-0.1.0.dist-info/METADATA +187 -0
  84. phoenix_engine-0.1.0.dist-info/RECORD +87 -0
  85. phoenix_engine-0.1.0.dist-info/WHEEL +5 -0
  86. phoenix_engine-0.1.0.dist-info/entry_points.txt +2 -0
  87. phoenix_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,482 @@
1
+ """Facebook platform adapter for Phoenix Engine.
2
+
3
+ Extracts structured data from public Facebook pages and posts using CSS
4
+ selector fallback chains. No official Facebook API is used.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from datetime import UTC, datetime
11
+ from typing import TYPE_CHECKING, Any, ClassVar
12
+ from urllib.parse import urljoin, urlparse
13
+
14
+ from bs4 import BeautifulSoup
15
+
16
+ from phoenix.adapters.base import BaseAdapter
17
+ from phoenix.models.output import UnifiedOutput
18
+ from phoenix.plugins.manifest import PluginManifest
19
+
20
+ if TYPE_CHECKING:
21
+ from phoenix.collectors.base import Collector
22
+ from phoenix.models.document import RawResponse
23
+ from phoenix.options import CollectionOptions
24
+
25
+
26
class FacebookAdapter(BaseAdapter):
    """Adapter for scraping public Facebook pages and posts."""

    # Regex sources (as strings) advertised through ``manifest.url_patterns``
    # and compiled case-insensitively by ``supported_patterns()``.
    # NOTE(review): none of the patterns is anchored at the end, and the last
    # one matches any ``facebook.com/<segment>`` URL, so it already covers
    # every URL matched by the more specific patterns above it.
    _URL_PATTERNS: ClassVar[list[str]] = [
        r"https?://(?:www\.)?facebook\.com/[^/]+/posts/[^/]+",
        r"https?://(?:www\.)?facebook\.com/[^/]+/photos/[^/]+",
        r"https?://(?:www\.)?facebook\.com/[^/]+/videos/[^/]+",
        r"https?://(?:www\.)?facebook\.com/[^/]+/reel/[^/]+",
        r"https?://(?:www\.)?facebook\.com/pages/[^/]+",
        r"https?://(?:www\.)?facebook\.com/[^/]+",
    ]

    # Field name -> ordered CSS selector fallback chain used by
    # ``_extract_page``. The "website" chain is additionally reused there to
    # read the ``href`` attribute.
    _PAGE_SELECTOR_SETS: ClassVar[dict[str, list[str]]] = {
        "name": [
            "h1.fb-page-name",
            ".fb-page-name",
            "[data-page-id] h1",
        ],
        "category": [
            ".fb-page-category",
            "[data-page-category]",
        ],
        "followers_count": [
            ".fb-followers-count",
            '.fb-page-counts [data-field="followers"]',
        ],
        "likes_count": [
            ".fb-likes-count",
            '.fb-page-counts [data-field="likes"]',
        ],
        "description": [
            ".fb-page-description",
            ".fb-page-about p",
        ],
        "website": [
            ".fb-page-website",
            ".fb-page-about a[href]",
        ],
    }

    # Field name -> ordered CSS selector fallback chain used by
    # ``_extract_post`` for posts, videos and reels.
    _POST_SELECTOR_SETS: ClassVar[dict[str, list[str]]] = {
        "author": [
            ".fb-author.author-name",
            ".fb-author",
            "[data-author-name]",
        ],
        "text": [
            ".fb-user-content.post-caption p",
            ".fb-user-content",
            ".post-caption",
        ],
        "reaction_count": [
            ".fb-reaction-count.like-count",
            ".fb-reaction-count",
            ".like-count",
        ],
        "comment_count": [
            ".fb-comment-count.comment-count",
            ".fb-comment-count",
            ".comment-count",
        ],
        "share_count": [
            ".fb-share-count.share-count",
            ".fb-share-count",
            ".share-count",
        ],
    }

    # Lowercase phrases; ``_is_public_content`` reports a document as
    # non-public when any of them occurs as a substring of its lowercased
    # visible text.
    # NOTE(review): several entries are substrings of others (e.g. "this page
    # isn't available" vs. "sorry, this page isn't available"), so the longer
    # variants never change the outcome.
    _PRIVATE_INDICATORS: ClassVar[list[str]] = [
        "log in to",
        "login to",
        "sign in to",
        "signin to",
        "this content isn't available",
        "this page isn't available",
        "page not found",
        "sorry, this page isn't available",
        "please log in",
        "please sign in",
        "you must log in",
        "you must sign in",
        "members only",
        "private group",
        "this account is private",
        "this profile is private",
        "friends only",
        "only friends",
        "only available to friends",
        "this content isn't available right now",
    ]
116
+
117
@property
def manifest(self) -> PluginManifest:
    """Build the plugin manifest describing the Facebook adapter."""
    manifest_fields: dict[str, Any] = {
        "name": "facebook",
        "version": "0.1.0",
        "description": "Scraper for public Facebook pages and posts.",
        "author": "Phoenix Engine Team",
        "platforms": ["facebook"],
        # Hand out a copy so callers cannot mutate the class-level list.
        "url_patterns": [*self._URL_PATTERNS],
        "strategies": ["browser", "http"],
        "requires_auth": True,
        "supports_ai_fallback": True,
    }
    return PluginManifest(**manifest_fields)
131
+
132
def supported_patterns(self) -> list[re.Pattern[str]]:
    """Compile every entry of ``_URL_PATTERNS`` as a case-insensitive regex."""
    compiled: list[re.Pattern[str]] = []
    for raw_pattern in self._URL_PATTERNS:
        compiled.append(re.compile(raw_pattern, re.IGNORECASE))
    return compiled
135
+
136
def preferred_strategies(self) -> list[str]:
    """Return collection strategies in preference order: browser, then HTTP."""
    ordered_strategies = ("browser", "http")
    return list(ordered_strategies)
139
+
140
async def collect(
    self,
    url: str,
    _strategy: str,
    collector: Collector,
    options: CollectionOptions,
) -> RawResponse:
    """Fetch ``url`` through ``collector`` and mark non-public responses.

    The response is always returned. When its HTML contains one of the
    private-content indicators, ``error`` is set to the ``SCR_061`` payload.
    ``_strategy`` is accepted but not used.
    """
    response = await collector.collect(url, options)
    if self._is_public_content(response.html):
        return response

    response.error = {
        "code": "SCR_061",
        "message": "Authentication required -- this content is not publicly accessible.",
    }
    return response
155
+
156
async def extract(self, raw_response: RawResponse) -> dict[str, Any]:
    """Parse the response HTML and dispatch to post or page extraction.

    The final (post-redirect) URL is preferred over the requested URL when
    deciding which kind of content was fetched.
    """
    document = BeautifulSoup(raw_response.html, "html.parser")
    page_url = raw_response.final_url or raw_response.url
    kind = self._classify_url(page_url)

    if kind not in {"post", "video", "reel"}:
        return self._extract_page(document, page_url)
    return self._extract_post(document, page_url, kind)
165
+
166
async def normalize(
    self,
    extracted: dict[str, Any],
    url: str,
    strategy: str,
) -> UnifiedOutput:
    """Assemble a ``UnifiedOutput`` from the extracted Facebook fields.

    A missing ``content_type`` is treated as ``"post"``; only ``"profile"``
    is routed through the page mapping.
    """
    kind = extracted.get("content_type", "post")
    matched_selectors = list(extracted.get("selectors_used", []))

    payload: dict[str, Any] = {
        "url": url,
        "platform": self.manifest.platforms[0],
        "content_type": kind,
        "scraping_strategy": strategy,
        "selectors_used": matched_selectors,
    }

    mapper = self._normalize_page if kind == "profile" else self._normalize_post
    payload.update(mapper(extracted, url))

    return UnifiedOutput(**payload)
190
+
191
def health_check(self) -> dict[str, Any]:
    """Extend the base health report with the manifest's ``requires_auth`` flag."""
    report = super().health_check()
    report.update(requires_auth=self.manifest.requires_auth)
    return report
196
+
197
+ # ------------------------------------------------------------------
198
+ # URL classification
199
+ # ------------------------------------------------------------------
200
+
201
+ def _classify_url(self, url: str) -> str:
202
+ """Classify a Facebook URL into a content type."""
203
+ path = urlparse(url).path.lower()
204
+ if "/posts/" in path or "/photos/" in path:
205
+ return "post"
206
+ if "/videos/" in path:
207
+ return "video"
208
+ if "/reel/" in path:
209
+ return "reel"
210
+ return "profile"
211
+
212
+ # ------------------------------------------------------------------
213
+ # Page extraction
214
+ # ------------------------------------------------------------------
215
+
216
def _extract_page(self, soup: BeautifulSoup, url: str) -> dict[str, Any]:
    """Collect profile-level fields from a parsed Facebook page.

    Text fields come from the ``_PAGE_SELECTOR_SETS`` fallback chains. The
    website is read separately from the ``href`` attribute and resolved
    against ``url``.
    """
    fields = self._extract_with_selectors(soup, self._PAGE_SELECTOR_SETS)
    matched_selectors = self._collect_selectors(fields)

    website_match = self._extract_attribute(
        soup,
        self._PAGE_SELECTOR_SETS["website"],
        "href",
    )
    website_selector = website_match["selector_used"]
    if website_selector:
        matched_selectors.append(website_selector)

    recent = self._extract_recent_posts(soup)

    def count_of(field: str) -> Any:
        # Strip surrounding label text before handing the count to the parser.
        return self._parse_engagement(self._clean_count_text(fields[field]["value"]))

    return {
        "content_type": "profile",
        "platform": "facebook",
        "url": url,
        "name": fields["name"]["value"],
        "category": fields["category"]["value"],
        "followers_count": count_of("followers_count"),
        "likes_count": count_of("likes_count"),
        "description": fields["description"]["value"],
        "website": self._resolve_url(website_match["value"], url),
        "recent_posts": recent,
        "selectors_used": matched_selectors,
        "is_public": self._is_public_content(str(soup)),
    }
252
+
253
def _extract_recent_posts(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Return ``{"post_id", "text"}`` previews for each post in the page feed."""

    def preview_of(node: Any) -> dict[str, Any]:
        raw_id = node.get("data-post-id")
        text_node = node.select_one(".fb-post-preview-text")
        return {
            "post_id": str(raw_id) if raw_id else None,
            "text": text_node.get_text(strip=True) if text_node else None,
        }

    return [preview_of(node) for node in soup.select(".fb-recent-posts .fb-post-preview")]
267
+
268
+ # ------------------------------------------------------------------
269
+ # Post extraction
270
+ # ------------------------------------------------------------------
271
+
272
def _extract_post(
    self,
    soup: BeautifulSoup,
    url: str,
    content_type: str = "post",
) -> dict[str, Any]:
    """Collect post-level fields (also used for videos and reels).

    Text fields come from the ``_POST_SELECTOR_SETS`` fallback chains. The
    story id and timestamp are read from attributes; when the markup has no
    story id, the last URL path segment is used instead.
    """
    fields = self._extract_with_selectors(soup, self._POST_SELECTOR_SETS)
    matched_selectors = self._collect_selectors(fields)

    story_match = self._extract_attribute(
        soup,
        ["article.fb-story[data-story-id]", ".fb-story[data-story-id]", "[data-story-id]"],
        "data-story-id",
    )
    if story_match["selector_used"]:
        matched_selectors.append(story_match["selector_used"])

    time_match = self._extract_attribute(
        soup,
        ["time.timestamp", "time", ".timestamp"],
        "datetime",
    )
    if time_match["selector_used"]:
        matched_selectors.append(time_match["selector_used"])

    profile_link = self._extract_author_url(soup, url)
    per_reaction = self._extract_reactions_breakdown(soup)
    attachments = self._extract_media_urls(soup, url)

    def count_of(field: str) -> Any:
        # Strip surrounding label text before handing the count to the parser.
        return self._parse_engagement(self._clean_count_text(fields[field]["value"]))

    return {
        "content_type": content_type,
        "platform": "facebook",
        "url": url,
        "id": story_match["value"] or self._parse_post_id_from_url(url),
        "author": fields["author"]["value"],
        "author_url": profile_link,
        "text": fields["text"]["value"],
        "timestamp": self._parse_iso_timestamp(time_match["value"]),
        "reaction_count": count_of("reaction_count"),
        "comment_count": count_of("comment_count"),
        "share_count": count_of("share_count"),
        "reactions_breakdown": per_reaction,
        "media_urls": attachments,
        "selectors_used": matched_selectors,
        "is_public": self._is_public_content(str(soup)),
    }
325
+
326
+ def _parse_post_id_from_url(self, url: str) -> str | None:
327
+ """Parse the post ID from the URL path when not found in HTML."""
328
+ parts = [p for p in urlparse(url).path.split("/") if p]
329
+ if parts and parts[-1] not in {"posts", "photos", "videos", "reel"}:
330
+ return parts[-1]
331
+ return None
332
+
333
def _extract_author_url(self, soup: BeautifulSoup, url: str) -> str | None:
    """Return the absolute author profile link, or ``None`` if absent."""
    anchor = soup.select_one(".fb-author-link[href]")
    if not anchor or not anchor.has_attr("href"):
        return None
    return self._resolve_url(str(anchor["href"]), url)
339
+
340
def _extract_reactions_breakdown(
    self,
    soup: BeautifulSoup,
) -> dict[str, int | None]:
    """Extract per-reaction counts (like, love, wow, haha, sad, angry).

    Each reaction is looked up via its ``.fb-reaction-<type>`` selector. The
    matched text goes through ``_clean_count_text`` before
    ``_parse_engagement``, the same way the aggregate reaction, comment and
    share counts are handled, so label text around the number does not
    defeat parsing.

    Returns:
        A mapping with one entry per reaction type; the value is whatever
        ``_parse_engagement`` yields for the cleaned text.
    """
    reaction_types = ("like", "love", "wow", "haha", "sad", "angry")
    # One selector pass for all reaction types instead of one per type.
    matches = self._extract_with_selectors(
        soup,
        {name: [f".fb-reaction-{name}"] for name in reaction_types},
    )
    return {
        name: self._parse_engagement(
            self._clean_count_text(matches[name]["value"]),
        )
        for name in reaction_types
    }
357
+
358
def _extract_media_urls(self, soup: BeautifulSoup, url: str) -> list[str]:
    """Return absolute ``src`` URLs of images and videos inside ``.fb-media``."""
    media_nodes = soup.select(".fb-media img[src], .fb-media video[src]")
    resolved_sources = (
        self._resolve_url(str(source), url)
        for node in media_nodes
        if (source := node.get("src"))
    )
    # Drop sources that resolve to nothing (e.g. whitespace-only values).
    return [absolute for absolute in resolved_sources if absolute]
368
+
369
+ # ------------------------------------------------------------------
370
+ # Normalization helpers
371
+ # ------------------------------------------------------------------
372
+
373
+ def _normalize_page(
374
+ self,
375
+ extracted: dict[str, Any],
376
+ url: str,
377
+ ) -> dict[str, Any]:
378
+ """Map page fields to UnifiedOutput fields."""
379
+ return {
380
+ "title": extracted.get("name"),
381
+ "text": extracted.get("description"),
382
+ "author": extracted.get("name"),
383
+ "author_url": url,
384
+ "likes": extracted.get("likes_count"),
385
+ "views": None,
386
+ "comments": None,
387
+ "shares": None,
388
+ "media_urls": [],
389
+ "thumbnail_url": None,
390
+ "tags": [],
391
+ }
392
+
393
+ def _normalize_post(
394
+ self,
395
+ extracted: dict[str, Any],
396
+ _url: str,
397
+ ) -> dict[str, Any]:
398
+ """Map post fields to UnifiedOutput fields."""
399
+ text = extracted.get("text")
400
+ title = text.split("\n")[0] if isinstance(text, str) and text else None
401
+ media_urls = list(extracted.get("media_urls", []))
402
+ return {
403
+ "title": title,
404
+ "text": text,
405
+ "author": extracted.get("author"),
406
+ "author_url": extracted.get("author_url"),
407
+ "timestamp": extracted.get("timestamp"),
408
+ "likes": extracted.get("reaction_count"),
409
+ "shares": extracted.get("share_count"),
410
+ "comments": extracted.get("comment_count"),
411
+ "views": None,
412
+ "media_urls": media_urls,
413
+ "thumbnail_url": media_urls[0] if media_urls else None,
414
+ "tags": [],
415
+ }
416
+
417
+ # ------------------------------------------------------------------
418
+ # Shared helpers
419
+ # ------------------------------------------------------------------
420
+
421
def _extract_attribute(
    self,
    soup: BeautifulSoup,
    selectors: list[str],
    attribute: str,
) -> dict[str, Any]:
    """Read ``attribute`` from the first selector whose first match carries it.

    Selectors are tried in order; only the first element matched by each one
    is inspected. The result dict has ``value``, ``selector_used`` and
    ``matched`` keys, with ``None``/``None``/``False`` when nothing matched.
    """
    for candidate in selectors:
        first_hit = soup.select_one(candidate)
        if first_hit is None or not first_hit.has_attr(attribute):
            continue
        return {
            "value": str(first_hit[attribute]),
            "selector_used": candidate,
            "matched": True,
        }
    return {"value": None, "selector_used": None, "matched": False}
437
+
438
+ def _collect_selectors(
439
+ self,
440
+ results: dict[str, dict[str, Any]],
441
+ ) -> list[str]:
442
+ """Collect selectors that successfully matched."""
443
+ selectors: list[str] = []
444
+ for result in results.values():
445
+ selector_used = result.get("selector_used")
446
+ if selector_used:
447
+ selectors.append(selector_used)
448
+ return selectors
449
+
450
+ def _resolve_url(self, value: str | None, base_url: str) -> str | None:
451
+ """Resolve a possibly relative URL against ``base_url``."""
452
+ if not value:
453
+ return None
454
+ stripped = value.strip()
455
+ if not stripped:
456
+ return None
457
+ return urljoin(base_url, stripped)
458
+
459
+ def _parse_iso_timestamp(self, value: str | None) -> datetime | None:
460
+ """Parse an ISO 8601 timestamp string into a UTC datetime."""
461
+ if not value:
462
+ return None
463
+ try:
464
+ parsed = datetime.fromisoformat(value.strip())
465
+ return parsed.astimezone(UTC)
466
+ except ValueError:
467
+ return None
468
+
469
def _is_public_content(self, html: str) -> bool:
    """Report whether ``html`` is free of every private-content indicator.

    The check is a case-insensitive substring search over the document's
    visible text.
    """
    visible_text = BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
    haystack = visible_text.lower()
    for phrase in self._PRIVATE_INDICATORS:
        if phrase in haystack:
            return False
    return True
474
+
475
+ def _clean_count_text(self, text: str | None) -> str | None:
476
+ """Extract a leading numeric count such as ``1.2K`` from noisy text."""
477
+ if not text:
478
+ return None
479
+ match = re.search(r"[\d\.,]+\s*[KMBkmb]?", text)
480
+ if match:
481
+ return match.group(0).strip()
482
+ return None
File without changes
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from bs4 import BeautifulSoup
7
+
8
+ from phoenix.adapters.base import BaseAdapter
9
+ from phoenix.collectors.base import Collector
10
+ from phoenix.models.document import RawResponse
11
+ from phoenix.models.output import UnifiedOutput
12
+ from phoenix.options import CollectionOptions
13
+ from phoenix.plugins.manifest import PluginManifest
14
+ from phoenix.processing.normalizer import Normalizer
15
+
16
+
17
+ def _select_values(soup: BeautifulSoup, selector: str) -> list[str]:
18
+ attr_match = re.search(r"::attr\(([^)]+)\)$", selector)
19
+ if attr_match:
20
+ plain_selector = selector[: attr_match.start()]
21
+ attribute_name = attr_match.group(1)
22
+ return [
23
+ str(value)
24
+ for element in soup.select(plain_selector)
25
+ if (value := element.get(attribute_name))
26
+ ]
27
+ if selector.endswith("::text"):
28
+ plain_selector = selector[: -len("::text")]
29
+ return [element.get_text(strip=True) for element in soup.select(plain_selector)]
30
+ return [element.get_text(strip=True) for element in soup.select(selector)]
31
+
32
+
33
class QuotesToScrapeAdapter(BaseAdapter):
    """Generated adapter for the quotes.toscrape.com landing page."""

    manifest = PluginManifest(
        name="quotes_to_scrape_adapter",
        version="1.0.0",
        platforms=["quotes_to_scrape"],
        # Dots are escaped so only the literal host matches; an unescaped "."
        # would also accept look-alike hosts such as "quotesXtoscrape.com".
        url_patterns=[r"^https://quotes\.toscrape\.com/$"],
        generated=True,
    )

    # Output field -> selector understood by ``_select_values``. Dict order
    # fixes the key order of the ``extract`` result.
    _FIELD_SELECTORS = {
        "quote_text": '.text[itemprop="text"]',
        "author_name": '.author[itemprop="author"]::text',
        "tags": ".tags a.tag::attr(href)",
    }

    def supported_patterns(self) -> list[re.Pattern[str]]:
        """Return the manifest URL patterns as compiled regexes."""
        return [re.compile(pattern) for pattern in self.manifest.url_patterns]

    async def collect(
        self,
        url: str,
        strategy: str,
        collector: Collector,
        options: CollectionOptions,
    ) -> RawResponse:
        """Fetch ``url`` through ``collector``.

        ``strategy`` is accepted but not used.

        Raises:
            ValueError: If the fetched HTML is not publicly accessible.
        """
        raw_response = await collector.collect(url, options)
        if not self._is_public_content(raw_response.html):
            raise ValueError("Content is not publicly accessible")
        return raw_response

    async def extract(self, raw_response: RawResponse) -> dict[str, Any]:
        """Extract quote texts, author names and tag links.

        Each field ``<name>`` is paired with a ``<name>_confidence`` list
        holding ``1.0`` for every extracted value.
        """
        soup = BeautifulSoup(raw_response.html, "html.parser")
        extracted: dict[str, Any] = {}
        for field, selector in self._FIELD_SELECTORS.items():
            # Run each selector once and derive the confidence list from the
            # same result instead of querying the document a second time.
            values = _select_values(soup, selector)
            extracted[field] = values
            extracted[f"{field}_confidence"] = [1.0] * len(values)
        return extracted

    async def normalize(
        self,
        extracted: dict[str, Any],
        url: str,
        strategy: str,
    ) -> UnifiedOutput:
        """Delegate normalization to the shared ``Normalizer``."""
        return await Normalizer().normalize(extracted, "quotes_to_scrape", url, strategy)