phoenix-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoenix/__init__.py +41 -0
- phoenix/__main__.py +8 -0
- phoenix/adapters/__init__.py +25 -0
- phoenix/adapters/base.py +230 -0
- phoenix/adapters/facebook.py +482 -0
- phoenix/adapters/generated/__init__.py +0 -0
- phoenix/adapters/generated/quotes_to_scrape.py +76 -0
- phoenix/adapters/generic.py +189 -0
- phoenix/adapters/instagram.py +426 -0
- phoenix/adapters/linkedin.py +542 -0
- phoenix/adapters/tiktok.py +557 -0
- phoenix/adapters/x_twitter.py +401 -0
- phoenix/adapters/youtube.py +544 -0
- phoenix/architect/__init__.py +15 -0
- phoenix/architect/coder.py +150 -0
- phoenix/architect/critic.py +324 -0
- phoenix/architect/explorer.py +232 -0
- phoenix/architect/fixture_generator.py +256 -0
- phoenix/architect/inspector.py +111 -0
- phoenix/architect/orchestrator.py +403 -0
- phoenix/architect/researcher.py +187 -0
- phoenix/architect/template_generator.py +145 -0
- phoenix/architect/writer.py +108 -0
- phoenix/cli/__init__.py +7 -0
- phoenix/cli/main.py +725 -0
- phoenix/collectors/__init__.py +17 -0
- phoenix/collectors/base.py +81 -0
- phoenix/collectors/browser.py +209 -0
- phoenix/collectors/browser_pool.py +197 -0
- phoenix/collectors/direct.py +132 -0
- phoenix/engine.py +257 -0
- phoenix/exceptions.py +77 -0
- phoenix/infrastructure/__init__.py +40 -0
- phoenix/infrastructure/audit_logger.py +68 -0
- phoenix/infrastructure/config.py +134 -0
- phoenix/infrastructure/license_manager.py +270 -0
- phoenix/infrastructure/rate_limiter.py +197 -0
- phoenix/infrastructure/session_manager.py +92 -0
- phoenix/infrastructure/storage.py +580 -0
- phoenix/infrastructure/vault.py +275 -0
- phoenix/intelligence/__init__.py +18 -0
- phoenix/intelligence/anti_bot_recovery.py +205 -0
- phoenix/intelligence/change_detector.py +314 -0
- phoenix/intelligence/classifier.py +179 -0
- phoenix/intelligence/entities.py +104 -0
- phoenix/intelligence/selector_health.py +139 -0
- phoenix/intelligence/selector_repair.py +35 -0
- phoenix/models/__init__.py +34 -0
- phoenix/models/classification.py +19 -0
- phoenix/models/config.py +207 -0
- phoenix/models/document.py +70 -0
- phoenix/models/output.py +182 -0
- phoenix/models/session.py +26 -0
- phoenix/models/strategy.py +28 -0
- phoenix/options.py +67 -0
- phoenix/pipeline.py +598 -0
- phoenix/plugins/__init__.py +9 -0
- phoenix/plugins/loader.py +266 -0
- phoenix/plugins/manifest.py +62 -0
- phoenix/plugins/registry.py +109 -0
- phoenix/processing/__init__.py +15 -0
- phoenix/processing/ai_assistant.py +101 -0
- phoenix/processing/archiver.py +124 -0
- phoenix/processing/domain_memory.py +304 -0
- phoenix/processing/html_extractor.py +79 -0
- phoenix/processing/normalizer.py +124 -0
- phoenix/processing/phoenix_ai_extractor.py +436 -0
- phoenix/py.typed +0 -0
- phoenix/router.py +304 -0
- phoenix/scrapers/__init__.py +33 -0
- phoenix/scrapers/base.py +13 -0
- phoenix/scrapers/browser.py +9 -0
- phoenix/scrapers/http.py +9 -0
- phoenix/scrapers/selector_engine.py +38 -0
- phoenix/stealth/__init__.py +21 -0
- phoenix/stealth/captcha.py +143 -0
- phoenix/stealth/humanizer.py +101 -0
- phoenix/stealth/profile.py +134 -0
- phoenix/stealth/rotator.py +87 -0
- phoenix/stealth/warming.py +56 -0
- phoenix/strategy_selector.py +145 -0
- phoenix/version.py +7 -0
- phoenix_engine-0.1.0.dist-info/METADATA +187 -0
- phoenix_engine-0.1.0.dist-info/RECORD +87 -0
- phoenix_engine-0.1.0.dist-info/WHEEL +5 -0
- phoenix_engine-0.1.0.dist-info/entry_points.txt +2 -0
- phoenix_engine-0.1.0.dist-info/top_level.txt +1 -0
phoenix/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Phoenix Engine -- universal pure web scraping engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from phoenix.adapters import (
|
|
6
|
+
BaseAdapter,
|
|
7
|
+
GenericWebAdapter,
|
|
8
|
+
PluginInterface,
|
|
9
|
+
ScraperPlugin,
|
|
10
|
+
)
|
|
11
|
+
from phoenix.engine import PhoenixEngine
|
|
12
|
+
from phoenix.models.output import (
|
|
13
|
+
CollectionResult,
|
|
14
|
+
ScrapingResult,
|
|
15
|
+
UnifiedOutput,
|
|
16
|
+
)
|
|
17
|
+
from phoenix.models.strategy import ScrapingStrategy
|
|
18
|
+
|
|
19
|
+
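# Options and core models were meant to be imported before the engine to
|
|
20
|
+
# prevent circular imports with router/pipeline submodules. NOTE(review): phoenix.engine is already imported above this point -- confirm the ordering constraint still holds.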
|
|
21
|
+
from phoenix.options import CollectionOptions, ScrapingOptions
|
|
22
|
+
from phoenix.plugins import PluginLoader, PluginManifest, PluginRegistry
|
|
23
|
+
from phoenix.version import __version__
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"BaseAdapter",
|
|
27
|
+
"CollectionOptions",
|
|
28
|
+
"CollectionResult",
|
|
29
|
+
"GenericWebAdapter",
|
|
30
|
+
"PhoenixEngine",
|
|
31
|
+
"PluginInterface",
|
|
32
|
+
"PluginLoader",
|
|
33
|
+
"PluginManifest",
|
|
34
|
+
"PluginRegistry",
|
|
35
|
+
"ScraperPlugin",
|
|
36
|
+
"ScrapingOptions",
|
|
37
|
+
"ScrapingResult",
|
|
38
|
+
"ScrapingStrategy",
|
|
39
|
+
"UnifiedOutput",
|
|
40
|
+
"__version__",
|
|
41
|
+
]
|
phoenix/__main__.py
ADDED
(8-line diff body not shown in this view)
phoenix/adapters/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Platform adapters that parse HTML into structured data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from phoenix.adapters.base import BaseAdapter, PluginInterface, ScraperPlugin
|
|
6
|
+
from phoenix.adapters.facebook import FacebookAdapter
|
|
7
|
+
from phoenix.adapters.generic import GenericWebAdapter
|
|
8
|
+
from phoenix.adapters.instagram import InstagramAdapter
|
|
9
|
+
from phoenix.adapters.linkedin import LinkedInAdapter
|
|
10
|
+
from phoenix.adapters.tiktok import TikTokAdapter
|
|
11
|
+
from phoenix.adapters.x_twitter import XTwitterAdapter
|
|
12
|
+
from phoenix.adapters.youtube import YouTubeAdapter
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BaseAdapter",
|
|
16
|
+
"FacebookAdapter",
|
|
17
|
+
"GenericWebAdapter",
|
|
18
|
+
"InstagramAdapter",
|
|
19
|
+
"LinkedInAdapter",
|
|
20
|
+
"PluginInterface",
|
|
21
|
+
"ScraperPlugin",
|
|
22
|
+
"TikTokAdapter",
|
|
23
|
+
"XTwitterAdapter",
|
|
24
|
+
"YouTubeAdapter",
|
|
25
|
+
]
|
phoenix/adapters/base.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Abstract base class and plugin interface for Phoenix Engine adapters.
|
|
2
|
+
|
|
3
|
+
Adapters are self-contained platform scrapers that define how to collect,
|
|
4
|
+
extract, and normalize public HTML content for a specific site or family of
|
|
5
|
+
sites. The core engine discovers adapters through :class:`PluginLoader` and
|
|
6
|
+
routes URLs to them without modifying core code.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from typing import TYPE_CHECKING, Any
|
|
13
|
+
|
|
14
|
+
from bs4 import BeautifulSoup
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from re import Pattern
|
|
18
|
+
|
|
19
|
+
from phoenix.collectors.base import Collector
|
|
20
|
+
from phoenix.models.document import RawResponse
|
|
21
|
+
from phoenix.models.output import UnifiedOutput
|
|
22
|
+
from phoenix.options import CollectionOptions
|
|
23
|
+
from phoenix.plugins.manifest import PluginManifest
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class BaseAdapter(ABC):
    """Abstract contract implemented by every Phoenix Engine adapter.

    Subclasses must provide :attr:`manifest`, :meth:`supported_patterns`,
    :meth:`collect`, :meth:`extract`, and :meth:`normalize`. The remaining
    methods are concrete defaults and shared helpers for adapter authors.
    """

    @property
    @abstractmethod
    def manifest(self) -> PluginManifest:
        """Return the plugin manifest describing this adapter."""

    @abstractmethod
    def supported_patterns(self) -> list[Pattern[str]]:
        """Return compiled URL regex patterns handled by this adapter."""

    def preferred_strategies(self) -> list[str]:
        """Return the ordered list of preferred collection strategies.

        Defaults to ``["http", "browser"]`` so fast direct HTTP is attempted
        first, with headless browser rendering as a fallback.
        """
        return ["http", "browser"]

    @abstractmethod
    async def collect(
        self,
        url: str,
        strategy: str,
        collector: Collector,
        options: CollectionOptions,
    ) -> RawResponse:
        """Collect raw HTML for ``url`` using the supplied ``collector``.

        Args:
            url: Target URL to collect.
            strategy: Strategy identifier selected for this collection.
            collector: Concrete collector implementing the strategy.
            options: Collection options such as timeout and archive flags.

        Returns:
            A raw response containing HTML, status, headers, and metadata.
        """

    @abstractmethod
    async def extract(self, raw_response: RawResponse) -> dict[str, Any]:
        """Extract structured platform-specific fields from ``raw_response``.

        Args:
            raw_response: Raw HTML response produced by a collector.

        Returns:
            Dictionary of extracted fields. The exact keys are platform-
            specific; the adapter's ``normalize`` method maps them to the
            unified schema.
        """

    @abstractmethod
    async def normalize(
        self,
        extracted: dict[str, Any],
        url: str,
        strategy: str,
    ) -> UnifiedOutput:
        """Convert extracted fields into the unified output schema.

        Args:
            extracted: Dictionary returned by :meth:`extract`.
            url: Normalized source URL.
            strategy: Collection strategy that produced the raw response.

        Returns:
            A validated ``UnifiedOutput`` instance.
        """

    def health_check(self) -> dict[str, Any]:
        """Return adapter health and diagnostic metadata.

        The default report is built from :attr:`manifest` (name, version,
        platforms, number of URL patterns, auth and AI-fallback flags) plus
        the result of :meth:`preferred_strategies`.

        Subclasses may override this to include selector health, session
        status, or other platform-specific diagnostics.
        """
        manifest = self.manifest
        return {
            "adapter": manifest.name,
            "version": manifest.version,
            "platforms": manifest.platforms,
            "url_patterns": len(manifest.url_patterns),
            "strategies": self.preferred_strategies(),
            "requires_auth": manifest.requires_auth,
            "supports_ai_fallback": manifest.supports_ai_fallback,
        }

    # ------------------------------------------------------------------
    # Shared utility methods for adapter authors
    # ------------------------------------------------------------------

    def _is_public_content(self, html: str) -> bool:
        """Return ``True`` when ``html`` appears to be publicly accessible.

        This is a heuristic guard used by adapters to avoid extracting data
        from login walls or private/error pages. The visible text of the
        page is lower-cased and searched for common phrases that indicate
        authentication is required or the content is not public.

        Args:
            html: Raw HTML markup of the collected page.

        Returns:
            ``False`` if any known login-wall / private-page phrase occurs in
            the page text, ``True`` otherwise.
        """
        text = BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
        # Many sites render "isn't" with a typographic apostrophe (U+2019);
        # fold it to the ASCII form so the phrases below still match.
        text_lower = text.lower().replace("\u2019", "'")

        private_indicators = (
            "log in to",
            "login to",
            "sign in to",
            "signin to",
            "authentication required",
            "this content isn't available",
            "this page isn't available",
            "page not found",
            "sorry, this page isn't available",
            "please log in",
            "please sign in",
            "you must log in",
            "you must sign in",
            "members only",
            "private group",
            "this account is private",
            "this profile is private",
        )

        return not any(indicator in text_lower for indicator in private_indicators)

    def _extract_with_selectors(
        self,
        soup: BeautifulSoup,
        selector_sets: dict[str, list[str]],
    ) -> dict[str, Any]:
        """Extract fields from ``soup`` using ordered selector fallback chains.

        Args:
            soup: Parsed BeautifulSoup document.
            selector_sets: Mapping of field name to ordered list of CSS
                selectors. Each field is resolved by trying selectors in order
                and returning the first successful match.

        Returns:
            Dictionary with field names as keys. Values are dictionaries
            containing ``value`` (extracted text or ``None``),
            ``selector_used`` (the selector that matched or ``None``),
            ``matched`` (bool), and ``confidence`` (``1.0`` when matched,
            ``0.0`` otherwise).
        """
        results: dict[str, Any] = {}
        for field, selectors in selector_sets.items():
            value: str | None = None
            selector_used: str | None = None
            matched = False
            for selector in selectors:
                # Only the first hit in document order is needed, so there is
                # no reason to materialise the full match list.
                first = soup.select_one(selector)
                if first is None:
                    continue
                value = first.get_text(strip=True) if first.name else str(first)
                selector_used = selector
                matched = True
                break
            results[field] = {
                "value": value,
                "selector_used": selector_used,
                "matched": matched,
                "confidence": 1.0 if matched else 0.0,
            }
        return results

    def _parse_engagement(self, text: str | None) -> int | None:
        """Parse an engagement count string into an integer.

        Supports counts with ``K``/``M``/``B`` suffixes (case-insensitive)
        and comma separators. Fractions that remain after scaling are
        truncated toward zero. Returns ``None`` for empty, non-numeric,
        non-finite (``inf``/``nan``) or absurdly large input.

        Args:
            text: Raw count text such as ``"1.2K"``, or ``None``.

        Returns:
            The parsed count, or ``None`` when ``text`` cannot be parsed.

        Examples:
            ``"1.2K"`` -> ``1200``
            ``"3M"`` -> ``3000000``
            ``"1,234"`` -> ``1234``
        """
        # Local import keeps this helper self-contained.
        from decimal import Decimal, InvalidOperation

        if text is None:
            return None

        cleaned = text.strip().lower().replace(",", "")
        if not cleaned:
            return None

        multiplier = 1
        for suffix, factor in (
            ("k", 1_000),
            ("m", 1_000_000),
            ("b", 1_000_000_000),
        ):
            if cleaned.endswith(suffix):
                multiplier = factor
                cleaned = cleaned[:-1]
                break

        # Decimal keeps the scaling exact; with binary floats
        # ``1.005 * 1000`` is ``1004.9999999999999`` and truncates to 1004.
        try:
            number = Decimal(cleaned)
        except (InvalidOperation, ValueError):
            return None

        # Reject NaN/Infinity, and cap the magnitude near the float range so
        # input such as ``1e999999999`` cannot make ``int()`` build a
        # gigantic integer.
        max_exponent = 300
        if not number.is_finite() or number.adjusted() > max_exponent:
            return None

        return int(number * multiplier)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ``PluginInterface`` is provided as a synonym for authors who prefer that name.
|
|
224
|
+
PluginInterface = BaseAdapter
|
|
225
|
+
|
|
226
|
+
# ``ScraperPlugin`` is the public name used in Architecture v2.0.0 / API Spec v2.0.0.
|
|
227
|
+
ScraperPlugin = BaseAdapter
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
__all__ = ["BaseAdapter", "PluginInterface", "ScraperPlugin"]
|