phoenix-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. phoenix/__init__.py +41 -0
  2. phoenix/__main__.py +8 -0
  3. phoenix/adapters/__init__.py +25 -0
  4. phoenix/adapters/base.py +230 -0
  5. phoenix/adapters/facebook.py +482 -0
  6. phoenix/adapters/generated/__init__.py +0 -0
  7. phoenix/adapters/generated/quotes_to_scrape.py +76 -0
  8. phoenix/adapters/generic.py +189 -0
  9. phoenix/adapters/instagram.py +426 -0
  10. phoenix/adapters/linkedin.py +542 -0
  11. phoenix/adapters/tiktok.py +557 -0
  12. phoenix/adapters/x_twitter.py +401 -0
  13. phoenix/adapters/youtube.py +544 -0
  14. phoenix/architect/__init__.py +15 -0
  15. phoenix/architect/coder.py +150 -0
  16. phoenix/architect/critic.py +324 -0
  17. phoenix/architect/explorer.py +232 -0
  18. phoenix/architect/fixture_generator.py +256 -0
  19. phoenix/architect/inspector.py +111 -0
  20. phoenix/architect/orchestrator.py +403 -0
  21. phoenix/architect/researcher.py +187 -0
  22. phoenix/architect/template_generator.py +145 -0
  23. phoenix/architect/writer.py +108 -0
  24. phoenix/cli/__init__.py +7 -0
  25. phoenix/cli/main.py +725 -0
  26. phoenix/collectors/__init__.py +17 -0
  27. phoenix/collectors/base.py +81 -0
  28. phoenix/collectors/browser.py +209 -0
  29. phoenix/collectors/browser_pool.py +197 -0
  30. phoenix/collectors/direct.py +132 -0
  31. phoenix/engine.py +257 -0
  32. phoenix/exceptions.py +77 -0
  33. phoenix/infrastructure/__init__.py +40 -0
  34. phoenix/infrastructure/audit_logger.py +68 -0
  35. phoenix/infrastructure/config.py +134 -0
  36. phoenix/infrastructure/license_manager.py +270 -0
  37. phoenix/infrastructure/rate_limiter.py +197 -0
  38. phoenix/infrastructure/session_manager.py +92 -0
  39. phoenix/infrastructure/storage.py +580 -0
  40. phoenix/infrastructure/vault.py +275 -0
  41. phoenix/intelligence/__init__.py +18 -0
  42. phoenix/intelligence/anti_bot_recovery.py +205 -0
  43. phoenix/intelligence/change_detector.py +314 -0
  44. phoenix/intelligence/classifier.py +179 -0
  45. phoenix/intelligence/entities.py +104 -0
  46. phoenix/intelligence/selector_health.py +139 -0
  47. phoenix/intelligence/selector_repair.py +35 -0
  48. phoenix/models/__init__.py +34 -0
  49. phoenix/models/classification.py +19 -0
  50. phoenix/models/config.py +207 -0
  51. phoenix/models/document.py +70 -0
  52. phoenix/models/output.py +182 -0
  53. phoenix/models/session.py +26 -0
  54. phoenix/models/strategy.py +28 -0
  55. phoenix/options.py +67 -0
  56. phoenix/pipeline.py +598 -0
  57. phoenix/plugins/__init__.py +9 -0
  58. phoenix/plugins/loader.py +266 -0
  59. phoenix/plugins/manifest.py +62 -0
  60. phoenix/plugins/registry.py +109 -0
  61. phoenix/processing/__init__.py +15 -0
  62. phoenix/processing/ai_assistant.py +101 -0
  63. phoenix/processing/archiver.py +124 -0
  64. phoenix/processing/domain_memory.py +304 -0
  65. phoenix/processing/html_extractor.py +79 -0
  66. phoenix/processing/normalizer.py +124 -0
  67. phoenix/processing/phoenix_ai_extractor.py +436 -0
  68. phoenix/py.typed +0 -0
  69. phoenix/router.py +304 -0
  70. phoenix/scrapers/__init__.py +33 -0
  71. phoenix/scrapers/base.py +13 -0
  72. phoenix/scrapers/browser.py +9 -0
  73. phoenix/scrapers/http.py +9 -0
  74. phoenix/scrapers/selector_engine.py +38 -0
  75. phoenix/stealth/__init__.py +21 -0
  76. phoenix/stealth/captcha.py +143 -0
  77. phoenix/stealth/humanizer.py +101 -0
  78. phoenix/stealth/profile.py +134 -0
  79. phoenix/stealth/rotator.py +87 -0
  80. phoenix/stealth/warming.py +56 -0
  81. phoenix/strategy_selector.py +145 -0
  82. phoenix/version.py +7 -0
  83. phoenix_engine-0.1.0.dist-info/METADATA +187 -0
  84. phoenix_engine-0.1.0.dist-info/RECORD +87 -0
  85. phoenix_engine-0.1.0.dist-info/WHEEL +5 -0
  86. phoenix_engine-0.1.0.dist-info/entry_points.txt +2 -0
  87. phoenix_engine-0.1.0.dist-info/top_level.txt +1 -0
phoenix/__init__.py ADDED
@@ -0,0 +1,41 @@
1
+ """Phoenix Engine -- universal pure web scraping engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from phoenix.adapters import (
6
+ BaseAdapter,
7
+ GenericWebAdapter,
8
+ PluginInterface,
9
+ ScraperPlugin,
10
+ )
11
+ from phoenix.engine import PhoenixEngine
12
+ from phoenix.models.output import (
13
+ CollectionResult,
14
+ ScrapingResult,
15
+ UnifiedOutput,
16
+ )
17
+ from phoenix.models.strategy import ScrapingStrategy
18
+
19
+ # NOTE(review): options and core models were meant to be imported before the
20
+ # engine to avoid circular imports, but ``phoenix.engine`` is already imported above; confirm whether this ordering still matters.
21
+ from phoenix.options import CollectionOptions, ScrapingOptions
22
+ from phoenix.plugins import PluginLoader, PluginManifest, PluginRegistry
23
+ from phoenix.version import __version__
24
+
25
+ __all__ = [
26
+ "BaseAdapter",
27
+ "CollectionOptions",
28
+ "CollectionResult",
29
+ "GenericWebAdapter",
30
+ "PhoenixEngine",
31
+ "PluginInterface",
32
+ "PluginLoader",
33
+ "PluginManifest",
34
+ "PluginRegistry",
35
+ "ScraperPlugin",
36
+ "ScrapingOptions",
37
+ "ScrapingResult",
38
+ "ScrapingStrategy",
39
+ "UnifiedOutput",
40
+ "__version__",
41
+ ]
phoenix/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Entry point for ``python -m phoenix``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from phoenix.cli.main import app
6
+
7
+ if __name__ == "__main__":
8
+ app()
phoenix/adapters/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """Platform adapters that parse HTML into structured data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from phoenix.adapters.base import BaseAdapter, PluginInterface, ScraperPlugin
6
+ from phoenix.adapters.facebook import FacebookAdapter
7
+ from phoenix.adapters.generic import GenericWebAdapter
8
+ from phoenix.adapters.instagram import InstagramAdapter
9
+ from phoenix.adapters.linkedin import LinkedInAdapter
10
+ from phoenix.adapters.tiktok import TikTokAdapter
11
+ from phoenix.adapters.x_twitter import XTwitterAdapter
12
+ from phoenix.adapters.youtube import YouTubeAdapter
13
+
14
+ __all__ = [
15
+ "BaseAdapter",
16
+ "FacebookAdapter",
17
+ "GenericWebAdapter",
18
+ "InstagramAdapter",
19
+ "LinkedInAdapter",
20
+ "PluginInterface",
21
+ "ScraperPlugin",
22
+ "TikTokAdapter",
23
+ "XTwitterAdapter",
24
+ "YouTubeAdapter",
25
+ ]
phoenix/adapters/base.py ADDED
@@ -0,0 +1,230 @@
1
+ """Abstract base class and plugin interface for Phoenix Engine adapters.
2
+
3
+ Adapters are self-contained platform scrapers that define how to collect,
4
+ extract, and normalize public HTML content for a specific site or family of
5
+ sites. The core engine discovers adapters through :class:`PluginLoader` and
6
+ routes URLs to them without modifying core code.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ from bs4 import BeautifulSoup
15
+
16
+ if TYPE_CHECKING:
17
+ from re import Pattern
18
+
19
+ from phoenix.collectors.base import Collector
20
+ from phoenix.models.document import RawResponse
21
+ from phoenix.models.output import UnifiedOutput
22
+ from phoenix.options import CollectionOptions
23
+ from phoenix.plugins.manifest import PluginManifest
24
+
25
+
26
class BaseAdapter(ABC):
    """Abstract contract implemented by every Phoenix Engine adapter.

    Concrete adapters must provide :attr:`manifest`,
    :meth:`supported_patterns`, :meth:`collect`, :meth:`extract`, and
    :meth:`normalize`. The remaining methods are overridable defaults and
    shared helpers for adapter authors.
    """

    @property
    @abstractmethod
    def manifest(self) -> PluginManifest:
        """Return the plugin manifest describing this adapter."""

    @abstractmethod
    def supported_patterns(self) -> list[Pattern[str]]:
        """Return compiled URL regex patterns handled by this adapter."""

    def preferred_strategies(self) -> list[str]:
        """Return the ordered list of preferred collection strategies.

        Defaults to ``["http", "browser"]`` so fast direct HTTP is attempted
        first, with headless browser rendering as a fallback.
        """
        return ["http", "browser"]

    @abstractmethod
    async def collect(
        self,
        url: str,
        strategy: str,
        collector: Collector,
        options: CollectionOptions,
    ) -> RawResponse:
        """Collect raw HTML for ``url`` using the supplied ``collector``.

        Args:
            url: Target URL to collect.
            strategy: Strategy identifier selected for this collection.
            collector: Concrete collector implementing the strategy.
            options: Collection options such as timeout and archive flags.

        Returns:
            A raw response containing HTML, status, headers, and metadata.
        """

    @abstractmethod
    async def extract(self, raw_response: RawResponse) -> dict[str, Any]:
        """Extract structured platform-specific fields from ``raw_response``.

        Args:
            raw_response: Raw HTML response produced by a collector.

        Returns:
            Dictionary of extracted fields. The exact keys are platform-
            specific; the adapter's ``normalize`` method maps them to the
            unified schema.
        """

    @abstractmethod
    async def normalize(
        self,
        extracted: dict[str, Any],
        url: str,
        strategy: str,
    ) -> UnifiedOutput:
        """Convert extracted fields into the unified output schema.

        Args:
            extracted: Dictionary returned by :meth:`extract`.
            url: Normalized source URL.
            strategy: Collection strategy that produced the raw response.

        Returns:
            A validated ``UnifiedOutput`` instance.
        """

    def health_check(self) -> dict[str, Any]:
        """Return adapter health and diagnostic metadata.

        The default implementation reports fields read from
        :attr:`manifest` plus the result of :meth:`preferred_strategies`.
        Subclasses may override this to include selector health, session
        status, or other platform-specific diagnostics.

        Returns:
            Dictionary with the keys ``adapter``, ``version``, ``platforms``,
            ``url_patterns`` (a count, not the patterns themselves),
            ``strategies``, ``requires_auth``, and ``supports_ai_fallback``.
        """
        return {
            "adapter": self.manifest.name,
            "version": self.manifest.version,
            "platforms": self.manifest.platforms,
            "url_patterns": len(self.manifest.url_patterns),
            "strategies": self.preferred_strategies(),
            "requires_auth": self.manifest.requires_auth,
            "supports_ai_fallback": self.manifest.supports_ai_fallback,
        }

    # ------------------------------------------------------------------
    # Shared utility methods for adapter authors
    # ------------------------------------------------------------------

    def _is_public_content(self, html: str) -> bool:
        """Return ``True`` when ``html`` appears to be publicly accessible.

        This is a heuristic guard used by adapters to avoid extracting data
        from login walls or private/error pages. The visible text of the
        document is lower-cased and searched for common phrases that indicate
        authentication is required or the content is not public.

        Args:
            html: Raw HTML markup to inspect.

        Returns:
            ``False`` if any indicator phrase occurs anywhere in the page
            text, ``True`` otherwise.
        """
        text = BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
        text_lower = text.lower()

        private_indicators = [
            "log in to",
            "login to",
            "sign in to",
            "signin to",
            "authentication required",
            "this content isn't available",
            "this page isn't available",
            "page not found",
            "sorry, this page isn't available",
            "please log in",
            "please sign in",
            "you must log in",
            "you must sign in",
            "members only",
            "private group",
            "this account is private",
            "this profile is private",
        ]

        # Plain substring match: one hit anywhere in the text is enough to
        # treat the page as non-public.
        return not any(indicator in text_lower for indicator in private_indicators)

    def _extract_with_selectors(
        self,
        soup: BeautifulSoup,
        selector_sets: dict[str, list[str]],
    ) -> dict[str, Any]:
        """Extract fields from ``soup`` using ordered selector fallback chains.

        Args:
            soup: Parsed BeautifulSoup document.
            selector_sets: Mapping of field name to ordered list of CSS
                selectors. Each field is resolved by trying selectors in order
                and returning the first successful match.

        Returns:
            Dictionary with field names as keys. Values are dictionaries
            containing ``value`` (extracted text or ``None``),
            ``selector_used`` (the selector that matched or ``None``),
            ``matched`` (bool), and ``confidence`` (``1.0`` when matched,
            ``0.0`` otherwise).
        """
        results: dict[str, Any] = {}
        for field, selectors in selector_sets.items():
            value: str | None = None
            selector_used: str | None = None
            matched = False
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    # Only the first matching element is used; later matches
                    # for the same selector are ignored.
                    first = elements[0]
                    value = first.get_text(strip=True) if first.name else str(first)
                    selector_used = selector
                    matched = True
                    break
            results[field] = {
                "value": value,
                "selector_used": selector_used,
                "matched": matched,
                "confidence": 1.0 if matched else 0.0,
            }
        return results

    def _parse_engagement(self, text: str | None) -> int | None:
        """Parse an engagement count string into an integer.

        Supports counts with the case-insensitive suffixes ``K``, ``M``, and
        ``B`` and comma separators. Fractional results are truncated toward
        zero.

        Args:
            text: Raw count text, or ``None``.

        Returns:
            The parsed count, or ``None`` for ``None``, empty, unparseable,
            or non-finite input (e.g. ``"inf"``, ``"nan"``, ``"1e400"``).

        Examples:
            ``"1.2K"`` -> ``1200``
            ``"3M"`` -> ``3000000``
            ``"1,234"`` -> ``1234``
        """
        if text is None:
            return None

        cleaned = text.strip().lower().replace(",", "")
        if not cleaned:
            return None

        multiplier = 1
        if cleaned.endswith("k"):
            multiplier = 1_000
            cleaned = cleaned[:-1]
        elif cleaned.endswith("m"):
            multiplier = 1_000_000
            cleaned = cleaned[:-1]
        elif cleaned.endswith("b"):
            multiplier = 1_000_000_000
            cleaned = cleaned[:-1]

        try:
            scaled = float(cleaned) * multiplier
            # Round away binary floating-point noise before truncating, so
            # that e.g. 1.005 * 1000 == 1004.9999999999999 becomes 1005
            # rather than 1004. Genuine fractions such as "1.5" still
            # truncate to 1.
            return int(round(scaled, 6))
        except (ValueError, OverflowError):
            # ValueError: text is not a number, or the value is NaN.
            # OverflowError: the value is infinite ("inf", "1e400", ...).
            return None
221
+
222
+
223
+ # ``PluginInterface`` is provided as a synonym for authors who prefer that name.
224
+ PluginInterface = BaseAdapter
225
+
226
+ # ``ScraperPlugin`` is the public name used in Architecture v2.0.0 / API Spec v2.0.0.
227
+ ScraperPlugin = BaseAdapter
228
+
229
+
230
+ __all__ = ["BaseAdapter", "PluginInterface", "ScraperPlugin"]