phoenix-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoenix/__init__.py +41 -0
- phoenix/__main__.py +8 -0
- phoenix/adapters/__init__.py +25 -0
- phoenix/adapters/base.py +230 -0
- phoenix/adapters/facebook.py +482 -0
- phoenix/adapters/generated/__init__.py +0 -0
- phoenix/adapters/generated/quotes_to_scrape.py +76 -0
- phoenix/adapters/generic.py +189 -0
- phoenix/adapters/instagram.py +426 -0
- phoenix/adapters/linkedin.py +542 -0
- phoenix/adapters/tiktok.py +557 -0
- phoenix/adapters/x_twitter.py +401 -0
- phoenix/adapters/youtube.py +544 -0
- phoenix/architect/__init__.py +15 -0
- phoenix/architect/coder.py +150 -0
- phoenix/architect/critic.py +324 -0
- phoenix/architect/explorer.py +232 -0
- phoenix/architect/fixture_generator.py +256 -0
- phoenix/architect/inspector.py +111 -0
- phoenix/architect/orchestrator.py +403 -0
- phoenix/architect/researcher.py +187 -0
- phoenix/architect/template_generator.py +145 -0
- phoenix/architect/writer.py +108 -0
- phoenix/cli/__init__.py +7 -0
- phoenix/cli/main.py +725 -0
- phoenix/collectors/__init__.py +17 -0
- phoenix/collectors/base.py +81 -0
- phoenix/collectors/browser.py +209 -0
- phoenix/collectors/browser_pool.py +197 -0
- phoenix/collectors/direct.py +132 -0
- phoenix/engine.py +257 -0
- phoenix/exceptions.py +77 -0
- phoenix/infrastructure/__init__.py +40 -0
- phoenix/infrastructure/audit_logger.py +68 -0
- phoenix/infrastructure/config.py +134 -0
- phoenix/infrastructure/license_manager.py +270 -0
- phoenix/infrastructure/rate_limiter.py +197 -0
- phoenix/infrastructure/session_manager.py +92 -0
- phoenix/infrastructure/storage.py +580 -0
- phoenix/infrastructure/vault.py +275 -0
- phoenix/intelligence/__init__.py +18 -0
- phoenix/intelligence/anti_bot_recovery.py +205 -0
- phoenix/intelligence/change_detector.py +314 -0
- phoenix/intelligence/classifier.py +179 -0
- phoenix/intelligence/entities.py +104 -0
- phoenix/intelligence/selector_health.py +139 -0
- phoenix/intelligence/selector_repair.py +35 -0
- phoenix/models/__init__.py +34 -0
- phoenix/models/classification.py +19 -0
- phoenix/models/config.py +207 -0
- phoenix/models/document.py +70 -0
- phoenix/models/output.py +182 -0
- phoenix/models/session.py +26 -0
- phoenix/models/strategy.py +28 -0
- phoenix/options.py +67 -0
- phoenix/pipeline.py +598 -0
- phoenix/plugins/__init__.py +9 -0
- phoenix/plugins/loader.py +266 -0
- phoenix/plugins/manifest.py +62 -0
- phoenix/plugins/registry.py +109 -0
- phoenix/processing/__init__.py +15 -0
- phoenix/processing/ai_assistant.py +101 -0
- phoenix/processing/archiver.py +124 -0
- phoenix/processing/domain_memory.py +304 -0
- phoenix/processing/html_extractor.py +79 -0
- phoenix/processing/normalizer.py +124 -0
- phoenix/processing/phoenix_ai_extractor.py +436 -0
- phoenix/py.typed +0 -0
- phoenix/router.py +304 -0
- phoenix/scrapers/__init__.py +33 -0
- phoenix/scrapers/base.py +13 -0
- phoenix/scrapers/browser.py +9 -0
- phoenix/scrapers/http.py +9 -0
- phoenix/scrapers/selector_engine.py +38 -0
- phoenix/stealth/__init__.py +21 -0
- phoenix/stealth/captcha.py +143 -0
- phoenix/stealth/humanizer.py +101 -0
- phoenix/stealth/profile.py +134 -0
- phoenix/stealth/rotator.py +87 -0
- phoenix/stealth/warming.py +56 -0
- phoenix/strategy_selector.py +145 -0
- phoenix/version.py +7 -0
- phoenix_engine-0.1.0.dist-info/METADATA +187 -0
- phoenix_engine-0.1.0.dist-info/RECORD +87 -0
- phoenix_engine-0.1.0.dist-info/WHEEL +5 -0
- phoenix_engine-0.1.0.dist-info/entry_points.txt +2 -0
- phoenix_engine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
"""Facebook platform adapter for Phoenix Engine.
|
|
2
|
+
|
|
3
|
+
Extracts structured data from public Facebook pages and posts using CSS
|
|
4
|
+
selector fallback chains. No official Facebook API is used.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from datetime import UTC, datetime
|
|
11
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
12
|
+
from urllib.parse import urljoin, urlparse
|
|
13
|
+
|
|
14
|
+
from bs4 import BeautifulSoup
|
|
15
|
+
|
|
16
|
+
from phoenix.adapters.base import BaseAdapter
|
|
17
|
+
from phoenix.models.output import UnifiedOutput
|
|
18
|
+
from phoenix.plugins.manifest import PluginManifest
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from phoenix.collectors.base import Collector
|
|
22
|
+
from phoenix.models.document import RawResponse
|
|
23
|
+
from phoenix.options import CollectionOptions
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FacebookAdapter(BaseAdapter):
|
|
27
|
+
"""Adapter for scraping public Facebook pages and posts."""
|
|
28
|
+
|
|
29
|
+
# Regular-expression sources for the Facebook URLs this adapter claims.
# Content-specific paths (posts, photos, videos, reels, /pages/) are listed
# first; the last entry accepts any single path segment after the host.
# ``supported_patterns`` compiles these strings and ``manifest`` exposes a
# copy of the list as ``url_patterns``.
_URL_PATTERNS: ClassVar[list[str]] = [
    r"https?://(?:www\.)?facebook\.com/[^/]+/posts/[^/]+",
    r"https?://(?:www\.)?facebook\.com/[^/]+/photos/[^/]+",
    r"https?://(?:www\.)?facebook\.com/[^/]+/videos/[^/]+",
    r"https?://(?:www\.)?facebook\.com/[^/]+/reel/[^/]+",
    r"https?://(?:www\.)?facebook\.com/pages/[^/]+",
    r"https?://(?:www\.)?facebook\.com/[^/]+",
]
|
|
37
|
+
|
|
38
|
+
# CSS selector chains for page (profile) fields, keyed by output field name.
# ``_extract_page`` passes the whole mapping to ``_extract_with_selectors``
# and additionally reads the ``href`` attribute through the "website" chain.
# NOTE(review): each list looks like a most-specific-first fallback order --
# confirm against ``_extract_with_selectors`` in the base adapter.
_PAGE_SELECTOR_SETS: ClassVar[dict[str, list[str]]] = {
    "name": [
        "h1.fb-page-name",
        ".fb-page-name",
        "[data-page-id] h1",
    ],
    "category": [
        ".fb-page-category",
        "[data-page-category]",
    ],
    "followers_count": [
        ".fb-followers-count",
        '.fb-page-counts [data-field="followers"]',
    ],
    "likes_count": [
        ".fb-likes-count",
        '.fb-page-counts [data-field="likes"]',
    ],
    "description": [
        ".fb-page-description",
        ".fb-page-about p",
    ],
    "website": [
        ".fb-page-website",
        ".fb-page-about a[href]",
    ],
}
|
|
65
|
+
|
|
66
|
+
# CSS selector chains for post fields, keyed by output field name.
# ``_extract_post`` passes the whole mapping to ``_extract_with_selectors``;
# the three ``*_count`` values are then cleaned and parsed into numbers.
# NOTE(review): each list looks like a most-specific-first fallback order --
# confirm against ``_extract_with_selectors`` in the base adapter.
_POST_SELECTOR_SETS: ClassVar[dict[str, list[str]]] = {
    "author": [
        ".fb-author.author-name",
        ".fb-author",
        "[data-author-name]",
    ],
    "text": [
        ".fb-user-content.post-caption p",
        ".fb-user-content",
        ".post-caption",
    ],
    "reaction_count": [
        ".fb-reaction-count.like-count",
        ".fb-reaction-count",
        ".like-count",
    ],
    "comment_count": [
        ".fb-comment-count.comment-count",
        ".fb-comment-count",
        ".comment-count",
    ],
    "share_count": [
        ".fb-share-count.share-count",
        ".fb-share-count",
        ".share-count",
    ],
}
|
|
93
|
+
|
|
94
|
+
# Lower-case phrases that mark a page as not publicly accessible.
# ``_is_public_content`` lower-cases the visible page text and reports the
# page as non-public when any of these phrases occurs in it as a substring.
# Because matching is by substring, some longer entries are already covered
# by shorter ones (e.g. "this content isn't available right now" by
# "this content isn't available"); they are kept as written.
_PRIVATE_INDICATORS: ClassVar[list[str]] = [
    "log in to",
    "login to",
    "sign in to",
    "signin to",
    "this content isn't available",
    "this page isn't available",
    "page not found",
    "sorry, this page isn't available",
    "please log in",
    "please sign in",
    "you must log in",
    "you must sign in",
    "members only",
    "private group",
    "this account is private",
    "this profile is private",
    "friends only",
    "only friends",
    "only available to friends",
    "this content isn't available right now",
]
|
|
116
|
+
|
|
117
|
+
@property
def manifest(self) -> PluginManifest:
    """Describe this adapter as a ``PluginManifest``.

    A new manifest object is built on every access. ``url_patterns`` is a
    copy of ``_URL_PATTERNS``, so the class-level list itself is not handed
    out.
    """
    pattern_sources = list(self._URL_PATTERNS)
    descriptor = PluginManifest(
        name="facebook",
        version="0.1.0",
        description="Scraper for public Facebook pages and posts.",
        author="Phoenix Engine Team",
        platforms=["facebook"],
        url_patterns=pattern_sources,
        strategies=["browser", "http"],
        requires_auth=True,
        supports_ai_fallback=True,
    )
    return descriptor
|
|
131
|
+
|
|
132
|
+
def supported_patterns(self) -> list[re.Pattern[str]]:
    """Compile every ``_URL_PATTERNS`` entry into a case-insensitive regex.

    Returns:
        One compiled pattern per source string, in the same order.
    """
    flags = re.IGNORECASE
    return [re.compile(source, flags) for source in self._URL_PATTERNS]
|
|
135
|
+
|
|
136
|
+
def preferred_strategies(self) -> list[str]:
    """Return collection strategies in preference order.

    Facebook is heavily JavaScript-rendered, so ``"browser"`` is listed
    before ``"http"``.
    """
    ordered = ["browser", "http"]
    return ordered
|
|
139
|
+
|
|
140
|
+
async def collect(
    self,
    url: str,
    _strategy: str,
    collector: Collector,
    options: CollectionOptions,
) -> RawResponse:
    """Fetch ``url`` through ``collector`` and flag non-public content.

    Args:
        url: Address to collect.
        _strategy: Accepted for interface compatibility; not used here.
        collector: Object whose ``collect(url, options)`` coroutine is awaited.
        options: Passed through to the collector unchanged.

    Returns:
        The collector's response. When ``_is_public_content`` rejects its
        HTML, the response's ``error`` attribute is set to an ``SCR_061``
        payload before it is returned.
    """
    response = await collector.collect(url, options)
    if self._is_public_content(response.html):
        return response
    response.error = {
        "code": "SCR_061",
        "message": "Authentication required -- this content is not publicly accessible.",
    }
    return response
|
|
155
|
+
|
|
156
|
+
async def extract(self, raw_response: RawResponse) -> dict[str, Any]:
    """Parse ``raw_response.html`` and extract post or page fields.

    The URL used for classification is ``final_url`` when set, otherwise
    ``url``. Post, video and reel URLs go to ``_extract_post``; everything
    else is treated as a page and goes to ``_extract_page``.
    """
    document = BeautifulSoup(raw_response.html, "html.parser")
    page_url = raw_response.final_url or raw_response.url
    kind = self._classify_url(page_url)

    if kind not in {"post", "video", "reel"}:
        return self._extract_page(document, page_url)
    return self._extract_post(document, page_url, kind)
|
|
165
|
+
|
|
166
|
+
async def normalize(
    self,
    extracted: dict[str, Any],
    url: str,
    strategy: str,
) -> UnifiedOutput:
    """Convert extracted Facebook fields into ``UnifiedOutput``.

    Args:
        extracted: Field dictionary as produced by ``extract``.
        url: URL the data was collected from.
        strategy: Name of the scraping strategy that produced the data.

    Returns:
        A ``UnifiedOutput`` built from the common fields plus the page- or
        post-specific mapping, chosen by ``extracted["content_type"]``
        (``"post"`` when the key is absent).
    """
    content_type = extracted.get("content_type", "post")
    # ``or []`` also covers a key that is present but holds None; the plain
    # ``.get(key, [])`` default only applies when the key is missing.
    selectors_used = list(extracted.get("selectors_used") or [])

    base_output: dict[str, Any] = {
        "url": url,
        "platform": self.manifest.platforms[0],
        "content_type": content_type,
        "scraping_strategy": strategy,
        "selectors_used": selectors_used,
    }

    if content_type == "profile":
        base_output.update(self._normalize_page(extracted, url))
    else:
        base_output.update(self._normalize_post(extracted, url))

    return UnifiedOutput(**base_output)
|
|
190
|
+
|
|
191
|
+
def health_check(self) -> dict[str, Any]:
    """Return the base adapter's health report plus ``requires_auth``.

    The extra key is taken from this adapter's manifest.
    """
    report = super().health_check()
    report["requires_auth"] = self.manifest.requires_auth
    return report
|
|
196
|
+
|
|
197
|
+
# ------------------------------------------------------------------
|
|
198
|
+
# URL classification
|
|
199
|
+
# ------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
def _classify_url(self, url: str) -> str:
|
|
202
|
+
"""Classify a Facebook URL into a content type."""
|
|
203
|
+
path = urlparse(url).path.lower()
|
|
204
|
+
if "/posts/" in path or "/photos/" in path:
|
|
205
|
+
return "post"
|
|
206
|
+
if "/videos/" in path:
|
|
207
|
+
return "video"
|
|
208
|
+
if "/reel/" in path:
|
|
209
|
+
return "reel"
|
|
210
|
+
return "profile"
|
|
211
|
+
|
|
212
|
+
# ------------------------------------------------------------------
|
|
213
|
+
# Page extraction
|
|
214
|
+
# ------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
def _extract_page(self, soup: BeautifulSoup, url: str) -> dict[str, Any]:
    """Extract public Facebook page fields.

    Args:
        soup: Parsed page document.
        url: Page URL; also the base for resolving a relative website link.

    Returns:
        A dictionary with ``content_type`` set to ``"profile"``, the text
        fields named in ``_PAGE_SELECTOR_SETS``, parsed follower and like
        counts, the resolved website URL, recent post previews, the
        selectors that matched and an ``is_public`` flag.
    """
    text_results = self._extract_with_selectors(soup, self._PAGE_SELECTOR_SETS)
    selectors_used = self._collect_selectors(text_results)

    website = self._extract_attribute(
        soup,
        self._PAGE_SELECTOR_SETS["website"],
        "href",
    )
    website_selector = website["selector_used"]
    # The "website" chain is also part of the text pass above, so the same
    # selector may already be recorded; do not list it twice.
    if website_selector and website_selector not in selectors_used:
        selectors_used.append(website_selector)

    recent_posts = self._extract_recent_posts(soup)

    return {
        "content_type": "profile",
        "platform": "facebook",
        "url": url,
        "name": text_results["name"]["value"],
        "category": text_results["category"]["value"],
        "followers_count": self._parse_engagement(
            self._clean_count_text(text_results["followers_count"]["value"]),
        ),
        "likes_count": self._parse_engagement(
            self._clean_count_text(text_results["likes_count"]["value"]),
        ),
        "description": text_results["description"]["value"],
        "website": self._resolve_url(
            website["value"],
            url,
        ),
        "recent_posts": recent_posts,
        "selectors_used": selectors_used,
        "is_public": self._is_public_content(str(soup)),
    }
|
|
252
|
+
|
|
253
|
+
def _extract_recent_posts(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Collect post previews found under ``.fb-recent-posts``.

    Returns:
        One dictionary per ``.fb-post-preview`` element, in document order,
        with ``post_id`` (the ``data-post-id`` attribute as a string, or
        ``None``) and ``text`` (stripped text of ``.fb-post-preview-text``,
        or ``None`` when that element is missing).
    """

    def _preview(node: Any) -> dict[str, Any]:
        raw_id = node.get("data-post-id")
        body = node.select_one(".fb-post-preview-text")
        return {
            "post_id": str(raw_id) if raw_id else None,
            "text": body.get_text(strip=True) if body else None,
        }

    return [_preview(node) for node in soup.select(".fb-recent-posts .fb-post-preview")]
|
|
267
|
+
|
|
268
|
+
# ------------------------------------------------------------------
|
|
269
|
+
# Post extraction
|
|
270
|
+
# ------------------------------------------------------------------
|
|
271
|
+
|
|
272
|
+
def _extract_post(
    self,
    soup: BeautifulSoup,
    url: str,
    content_type: str = "post",
) -> dict[str, Any]:
    """Extract the fields of a single Facebook post, video or reel.

    Args:
        soup: Parsed post document.
        url: Post URL; used as the ID fallback and as the base for
            resolving relative links.
        content_type: Value stored under ``"content_type"`` in the result.

    Returns:
        A dictionary with the post ID (``data-story-id`` attribute, else the
        last URL path segment), author, text, parsed timestamp, engagement
        counts, per-reaction breakdown, media URLs, the selectors that
        matched and an ``is_public`` flag.
    """
    fields = self._extract_with_selectors(soup, self._POST_SELECTOR_SETS)
    matched_selectors = self._collect_selectors(fields)

    story_id = self._extract_attribute(
        soup,
        ["article.fb-story[data-story-id]", ".fb-story[data-story-id]", "[data-story-id]"],
        "data-story-id",
    )
    posted_at = self._extract_attribute(
        soup,
        ["time.timestamp", "time", ".timestamp"],
        "datetime",
    )
    # Record attribute selectors after the text selectors: ID first, then time.
    matched_selectors.extend(
        hit["selector_used"] for hit in (story_id, posted_at) if hit["selector_used"]
    )

    def _count(field: str) -> Any:
        return self._parse_engagement(self._clean_count_text(fields[field]["value"]))

    record: dict[str, Any] = {
        "content_type": content_type,
        "platform": "facebook",
        "url": url,
        "id": story_id["value"] or self._parse_post_id_from_url(url),
        "author": fields["author"]["value"],
        "author_url": self._extract_author_url(soup, url),
        "text": fields["text"]["value"],
        "timestamp": self._parse_iso_timestamp(posted_at["value"]),
        "reaction_count": _count("reaction_count"),
        "comment_count": _count("comment_count"),
        "share_count": _count("share_count"),
        "reactions_breakdown": self._extract_reactions_breakdown(soup),
        "media_urls": self._extract_media_urls(soup, url),
        "selectors_used": matched_selectors,
        "is_public": self._is_public_content(str(soup)),
    }
    return record
|
|
325
|
+
|
|
326
|
+
def _parse_post_id_from_url(self, url: str) -> str | None:
|
|
327
|
+
"""Parse the post ID from the URL path when not found in HTML."""
|
|
328
|
+
parts = [p for p in urlparse(url).path.split("/") if p]
|
|
329
|
+
if parts and parts[-1] not in {"posts", "photos", "videos", "reel"}:
|
|
330
|
+
return parts[-1]
|
|
331
|
+
return None
|
|
332
|
+
|
|
333
|
+
def _extract_author_url(self, soup: BeautifulSoup, url: str) -> str | None:
    """Return the author's profile link, resolved against ``url``.

    The link is the ``href`` of the first ``.fb-author-link[href]`` element;
    ``None`` is returned when no such element exists.
    """
    anchor = soup.select_one(".fb-author-link[href]")
    if not anchor or not anchor.has_attr("href"):
        return None
    return self._resolve_url(str(anchor["href"]), url)
|
|
339
|
+
|
|
340
|
+
def _extract_reactions_breakdown(
    self,
    soup: BeautifulSoup,
) -> dict[str, int | None]:
    """Extract individual reaction counts when visible.

    For each of ``like``, ``love``, ``wow``, ``haha``, ``sad`` and ``angry``
    the element ``.fb-reaction-<type>`` is looked up through
    ``_extract_with_selectors`` and its text is converted to a number.

    Returns:
        A dictionary with one entry per reaction type, in the order above.
    """
    reactions: dict[str, int | None] = {}
    for reaction_type in ("like", "love", "wow", "haha", "sad", "angry"):
        result = self._extract_with_selectors(
            soup,
            {
                reaction_type: [f".fb-reaction-{reaction_type}"],
            },
        )
        # Clean the raw text first, as every other count in this adapter
        # does, so surrounding label text does not reach the number parser.
        reactions[reaction_type] = self._parse_engagement(
            self._clean_count_text(result[reaction_type]["value"]),
        )
    return reactions
|
|
357
|
+
|
|
358
|
+
def _extract_media_urls(self, soup: BeautifulSoup, url: str) -> list[str]:
    """Return absolute URLs of images and videos inside ``.fb-media``.

    Elements are ``img[src]`` and ``video[src]`` in document order. Empty
    ``src`` values, and values that resolve to nothing, are skipped.
    """
    raw_sources = (
        node.get("src")
        for node in soup.select(".fb-media img[src], .fb-media video[src]")
    )
    resolved = (self._resolve_url(str(src), url) for src in raw_sources if src)
    return [link for link in resolved if link]
|
|
368
|
+
|
|
369
|
+
# ------------------------------------------------------------------
|
|
370
|
+
# Normalization helpers
|
|
371
|
+
# ------------------------------------------------------------------
|
|
372
|
+
|
|
373
|
+
def _normalize_page(
|
|
374
|
+
self,
|
|
375
|
+
extracted: dict[str, Any],
|
|
376
|
+
url: str,
|
|
377
|
+
) -> dict[str, Any]:
|
|
378
|
+
"""Map page fields to UnifiedOutput fields."""
|
|
379
|
+
return {
|
|
380
|
+
"title": extracted.get("name"),
|
|
381
|
+
"text": extracted.get("description"),
|
|
382
|
+
"author": extracted.get("name"),
|
|
383
|
+
"author_url": url,
|
|
384
|
+
"likes": extracted.get("likes_count"),
|
|
385
|
+
"views": None,
|
|
386
|
+
"comments": None,
|
|
387
|
+
"shares": None,
|
|
388
|
+
"media_urls": [],
|
|
389
|
+
"thumbnail_url": None,
|
|
390
|
+
"tags": [],
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
def _normalize_post(
|
|
394
|
+
self,
|
|
395
|
+
extracted: dict[str, Any],
|
|
396
|
+
_url: str,
|
|
397
|
+
) -> dict[str, Any]:
|
|
398
|
+
"""Map post fields to UnifiedOutput fields."""
|
|
399
|
+
text = extracted.get("text")
|
|
400
|
+
title = text.split("\n")[0] if isinstance(text, str) and text else None
|
|
401
|
+
media_urls = list(extracted.get("media_urls", []))
|
|
402
|
+
return {
|
|
403
|
+
"title": title,
|
|
404
|
+
"text": text,
|
|
405
|
+
"author": extracted.get("author"),
|
|
406
|
+
"author_url": extracted.get("author_url"),
|
|
407
|
+
"timestamp": extracted.get("timestamp"),
|
|
408
|
+
"likes": extracted.get("reaction_count"),
|
|
409
|
+
"shares": extracted.get("share_count"),
|
|
410
|
+
"comments": extracted.get("comment_count"),
|
|
411
|
+
"views": None,
|
|
412
|
+
"media_urls": media_urls,
|
|
413
|
+
"thumbnail_url": media_urls[0] if media_urls else None,
|
|
414
|
+
"tags": [],
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
# ------------------------------------------------------------------
|
|
418
|
+
# Shared helpers
|
|
419
|
+
# ------------------------------------------------------------------
|
|
420
|
+
|
|
421
|
+
def _extract_attribute(
    self,
    soup: BeautifulSoup,
    selectors: list[str],
    attribute: str,
) -> dict[str, Any]:
    """Read ``attribute`` from the first selector whose first match has it.

    Selectors are tried in order. For each one only the first matching
    element is inspected; if it lacks the attribute the next selector is
    tried.

    Returns:
        ``{"value": <str>, "selector_used": <selector>, "matched": True}``
        on success, otherwise the same keys with ``None``, ``None`` and
        ``False``.
    """
    for candidate in selectors:
        first = soup.select_one(candidate)
        if first is None or not first.has_attr(attribute):
            continue
        return {
            "value": str(first[attribute]),
            "selector_used": candidate,
            "matched": True,
        }
    return {"value": None, "selector_used": None, "matched": False}
|
|
437
|
+
|
|
438
|
+
def _collect_selectors(
|
|
439
|
+
self,
|
|
440
|
+
results: dict[str, dict[str, Any]],
|
|
441
|
+
) -> list[str]:
|
|
442
|
+
"""Collect selectors that successfully matched."""
|
|
443
|
+
selectors: list[str] = []
|
|
444
|
+
for result in results.values():
|
|
445
|
+
selector_used = result.get("selector_used")
|
|
446
|
+
if selector_used:
|
|
447
|
+
selectors.append(selector_used)
|
|
448
|
+
return selectors
|
|
449
|
+
|
|
450
|
+
def _resolve_url(self, value: str | None, base_url: str) -> str | None:
|
|
451
|
+
"""Resolve a possibly relative URL against ``base_url``."""
|
|
452
|
+
if not value:
|
|
453
|
+
return None
|
|
454
|
+
stripped = value.strip()
|
|
455
|
+
if not stripped:
|
|
456
|
+
return None
|
|
457
|
+
return urljoin(base_url, stripped)
|
|
458
|
+
|
|
459
|
+
def _parse_iso_timestamp(self, value: str | None) -> datetime | None:
|
|
460
|
+
"""Parse an ISO 8601 timestamp string into a UTC datetime."""
|
|
461
|
+
if not value:
|
|
462
|
+
return None
|
|
463
|
+
try:
|
|
464
|
+
parsed = datetime.fromisoformat(value.strip())
|
|
465
|
+
return parsed.astimezone(UTC)
|
|
466
|
+
except ValueError:
|
|
467
|
+
return None
|
|
468
|
+
|
|
469
|
+
def _is_public_content(self, html: str) -> bool:
    """Tell whether ``html`` looks publicly accessible.

    The markup is parsed, its visible text is joined with single spaces and
    lower-cased, and the result is searched for each ``_PRIVATE_INDICATORS``
    phrase.

    Returns:
        ``False`` as soon as any indicator phrase is found, else ``True``.
    """
    visible = (
        BeautifulSoup(html, "html.parser")
        .get_text(separator=" ", strip=True)
        .lower()
    )
    for phrase in self._PRIVATE_INDICATORS:
        if phrase in visible:
            return False
    return True
|
|
474
|
+
|
|
475
|
+
def _clean_count_text(self, text: str | None) -> str | None:
|
|
476
|
+
"""Extract a leading numeric count such as ``1.2K`` from noisy text."""
|
|
477
|
+
if not text:
|
|
478
|
+
return None
|
|
479
|
+
match = re.search(r"[\d\.,]+\s*[KMBkmb]?", text)
|
|
480
|
+
if match:
|
|
481
|
+
return match.group(0).strip()
|
|
482
|
+
return None
|
|
File without changes
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
|
|
8
|
+
from phoenix.adapters.base import BaseAdapter
|
|
9
|
+
from phoenix.collectors.base import Collector
|
|
10
|
+
from phoenix.models.document import RawResponse
|
|
11
|
+
from phoenix.models.output import UnifiedOutput
|
|
12
|
+
from phoenix.options import CollectionOptions
|
|
13
|
+
from phoenix.plugins.manifest import PluginManifest
|
|
14
|
+
from phoenix.processing.normalizer import Normalizer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _select_values(soup: BeautifulSoup, selector: str) -> list[str]:
    """Evaluate a CSS selector with an optional ``::attr()``/``::text`` suffix.

    Args:
        soup: Parsed document to query.
        selector: A CSS selector, optionally ending in ``::attr(NAME)`` to
            read attribute ``NAME`` or in ``::text`` to read element text.

    Returns:
        For ``::attr(NAME)``: the non-empty attribute values as strings.
        Otherwise: the stripped text of every matching element.
    """
    attr_suffix = re.search(r"::attr\(([^)]+)\)$", selector)
    if attr_suffix is None:
        # With or without "::text" the result is the elements' stripped text.
        css = selector[: -len("::text")] if selector.endswith("::text") else selector
        return [node.get_text(strip=True) for node in soup.select(css)]

    css = selector[: attr_suffix.start()]
    wanted = attr_suffix.group(1)
    found = (node.get(wanted) for node in soup.select(css))
    return [str(item) for item in found if item]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class QuotesToScrapeAdapter(BaseAdapter):
    """Generated adapter for the quotes.toscrape.com front page."""

    manifest = PluginManifest(
        name="quotes_to_scrape_adapter",
        version="1.0.0",
        platforms=["quotes_to_scrape"],
        # Dots are escaped so they match only a literal "." in the host name
        # instead of any character.
        url_patterns=[r"^https://quotes\.toscrape\.com/$"],
        generated=True,
    )

    def supported_patterns(self) -> list[re.Pattern[str]]:
        """Compile the manifest's URL patterns."""
        return [re.compile(pattern) for pattern in self.manifest.url_patterns]

    async def collect(
        self,
        url: str,
        strategy: str,
        collector: Collector,
        options: CollectionOptions,
    ) -> RawResponse:
        """Fetch ``url`` through ``collector``.

        ``strategy`` is accepted for interface compatibility and not used.

        Raises:
            ValueError: If ``_is_public_content`` rejects the response HTML.
        """
        raw_response = await collector.collect(url, options)
        if not self._is_public_content(raw_response.html):
            raise ValueError("Content is not publicly accessible")
        return raw_response

    async def extract(self, raw_response: RawResponse) -> dict[str, Any]:
        """Extract quote texts, author names and tag links from the page.

        Returns:
            For each field a list of values plus a ``<field>_confidence``
            list holding ``1.0`` for every value.
        """
        soup = BeautifulSoup(raw_response.html, "html.parser")
        field_selectors = {
            "quote_text": '.text[itemprop="text"]',
            "author_name": '.author[itemprop="author"]::text',
            "tags": ".tags a.tag::attr(href)",
        }
        extracted: dict[str, Any] = {}
        for field, selector in field_selectors.items():
            # Query once and derive the confidence list from the same result.
            values = _select_values(soup, selector)
            extracted[field] = values
            extracted[f"{field}_confidence"] = [1.0] * len(values)
        return extracted

    async def normalize(
        self,
        extracted: dict[str, Any],
        url: str,
        strategy: str,
    ) -> UnifiedOutput:
        """Delegate to ``Normalizer`` with the ``quotes_to_scrape`` platform."""
        return await Normalizer().normalize(extracted, "quotes_to_scrape", url, strategy)
|