phoenix-engine 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoenix_engine-0.1.0/PKG-INFO +187 -0
- phoenix_engine-0.1.0/README.md +134 -0
- phoenix_engine-0.1.0/pyproject.toml +184 -0
- phoenix_engine-0.1.0/setup.cfg +4 -0
- phoenix_engine-0.1.0/src/phoenix/__init__.py +41 -0
- phoenix_engine-0.1.0/src/phoenix/__main__.py +8 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/__init__.py +25 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/base.py +230 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/facebook.py +482 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/generated/__init__.py +0 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/generated/quotes_to_scrape.py +76 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/generic.py +189 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/instagram.py +426 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/linkedin.py +542 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/tiktok.py +557 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/x_twitter.py +401 -0
- phoenix_engine-0.1.0/src/phoenix/adapters/youtube.py +544 -0
- phoenix_engine-0.1.0/src/phoenix/architect/__init__.py +15 -0
- phoenix_engine-0.1.0/src/phoenix/architect/coder.py +150 -0
- phoenix_engine-0.1.0/src/phoenix/architect/critic.py +324 -0
- phoenix_engine-0.1.0/src/phoenix/architect/explorer.py +232 -0
- phoenix_engine-0.1.0/src/phoenix/architect/fixture_generator.py +256 -0
- phoenix_engine-0.1.0/src/phoenix/architect/inspector.py +111 -0
- phoenix_engine-0.1.0/src/phoenix/architect/orchestrator.py +403 -0
- phoenix_engine-0.1.0/src/phoenix/architect/researcher.py +187 -0
- phoenix_engine-0.1.0/src/phoenix/architect/template_generator.py +145 -0
- phoenix_engine-0.1.0/src/phoenix/architect/writer.py +108 -0
- phoenix_engine-0.1.0/src/phoenix/cli/__init__.py +7 -0
- phoenix_engine-0.1.0/src/phoenix/cli/main.py +725 -0
- phoenix_engine-0.1.0/src/phoenix/collectors/__init__.py +17 -0
- phoenix_engine-0.1.0/src/phoenix/collectors/base.py +81 -0
- phoenix_engine-0.1.0/src/phoenix/collectors/browser.py +209 -0
- phoenix_engine-0.1.0/src/phoenix/collectors/browser_pool.py +197 -0
- phoenix_engine-0.1.0/src/phoenix/collectors/direct.py +132 -0
- phoenix_engine-0.1.0/src/phoenix/engine.py +257 -0
- phoenix_engine-0.1.0/src/phoenix/exceptions.py +77 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/__init__.py +40 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/audit_logger.py +68 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/config.py +134 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/license_manager.py +270 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/rate_limiter.py +197 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/session_manager.py +92 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/storage.py +580 -0
- phoenix_engine-0.1.0/src/phoenix/infrastructure/vault.py +275 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/__init__.py +18 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/anti_bot_recovery.py +205 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/change_detector.py +314 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/classifier.py +179 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/entities.py +104 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/selector_health.py +139 -0
- phoenix_engine-0.1.0/src/phoenix/intelligence/selector_repair.py +35 -0
- phoenix_engine-0.1.0/src/phoenix/models/__init__.py +34 -0
- phoenix_engine-0.1.0/src/phoenix/models/classification.py +19 -0
- phoenix_engine-0.1.0/src/phoenix/models/config.py +207 -0
- phoenix_engine-0.1.0/src/phoenix/models/document.py +70 -0
- phoenix_engine-0.1.0/src/phoenix/models/output.py +182 -0
- phoenix_engine-0.1.0/src/phoenix/models/session.py +26 -0
- phoenix_engine-0.1.0/src/phoenix/models/strategy.py +28 -0
- phoenix_engine-0.1.0/src/phoenix/options.py +67 -0
- phoenix_engine-0.1.0/src/phoenix/pipeline.py +598 -0
- phoenix_engine-0.1.0/src/phoenix/plugins/__init__.py +9 -0
- phoenix_engine-0.1.0/src/phoenix/plugins/loader.py +266 -0
- phoenix_engine-0.1.0/src/phoenix/plugins/manifest.py +62 -0
- phoenix_engine-0.1.0/src/phoenix/plugins/registry.py +109 -0
- phoenix_engine-0.1.0/src/phoenix/processing/__init__.py +15 -0
- phoenix_engine-0.1.0/src/phoenix/processing/ai_assistant.py +101 -0
- phoenix_engine-0.1.0/src/phoenix/processing/archiver.py +124 -0
- phoenix_engine-0.1.0/src/phoenix/processing/domain_memory.py +304 -0
- phoenix_engine-0.1.0/src/phoenix/processing/html_extractor.py +79 -0
- phoenix_engine-0.1.0/src/phoenix/processing/normalizer.py +124 -0
- phoenix_engine-0.1.0/src/phoenix/processing/phoenix_ai_extractor.py +436 -0
- phoenix_engine-0.1.0/src/phoenix/py.typed +0 -0
- phoenix_engine-0.1.0/src/phoenix/router.py +304 -0
- phoenix_engine-0.1.0/src/phoenix/scrapers/__init__.py +33 -0
- phoenix_engine-0.1.0/src/phoenix/scrapers/base.py +13 -0
- phoenix_engine-0.1.0/src/phoenix/scrapers/browser.py +9 -0
- phoenix_engine-0.1.0/src/phoenix/scrapers/http.py +9 -0
- phoenix_engine-0.1.0/src/phoenix/scrapers/selector_engine.py +38 -0
- phoenix_engine-0.1.0/src/phoenix/stealth/__init__.py +21 -0
- phoenix_engine-0.1.0/src/phoenix/stealth/captcha.py +143 -0
- phoenix_engine-0.1.0/src/phoenix/stealth/humanizer.py +101 -0
- phoenix_engine-0.1.0/src/phoenix/stealth/profile.py +134 -0
- phoenix_engine-0.1.0/src/phoenix/stealth/rotator.py +87 -0
- phoenix_engine-0.1.0/src/phoenix/stealth/warming.py +56 -0
- phoenix_engine-0.1.0/src/phoenix/strategy_selector.py +145 -0
- phoenix_engine-0.1.0/src/phoenix/version.py +7 -0
- phoenix_engine-0.1.0/src/phoenix_engine.egg-info/PKG-INFO +187 -0
- phoenix_engine-0.1.0/src/phoenix_engine.egg-info/SOURCES.txt +90 -0
- phoenix_engine-0.1.0/src/phoenix_engine.egg-info/dependency_links.txt +1 -0
- phoenix_engine-0.1.0/src/phoenix_engine.egg-info/entry_points.txt +2 -0
- phoenix_engine-0.1.0/src/phoenix_engine.egg-info/requires.txt +32 -0
- phoenix_engine-0.1.0/src/phoenix_engine.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: phoenix-engine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Universal pure web scraping engine.
|
|
5
|
+
Author: Phoenix Engine Team
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/phnx-tech/phoenix-engine
|
|
8
|
+
Project-URL: Documentation, https://github.com/phnx-tech/phoenix-engine#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/phnx-tech/phoenix-engine.git
|
|
10
|
+
Project-URL: Issues, https://github.com/phnx-tech/phoenix-engine/issues
|
|
11
|
+
Keywords: scraping,web,social-media,html,automation
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: httpx>=0.27.0
|
|
23
|
+
Requires-Dist: playwright>=1.40.0
|
|
24
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
25
|
+
Requires-Dist: lxml>=5.0.0
|
|
26
|
+
Requires-Dist: cssselect>=1.2.0
|
|
27
|
+
Requires-Dist: pydantic>=2.0.0
|
|
28
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
29
|
+
Requires-Dist: typer>=0.12.0
|
|
30
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
31
|
+
Requires-Dist: alembic>=1.13.0
|
|
32
|
+
Requires-Dist: keyring>=24.0.0
|
|
33
|
+
Requires-Dist: cryptography>=42.0.0
|
|
34
|
+
Requires-Dist: structlog>=24.0.0
|
|
35
|
+
Requires-Dist: openai>=1.0.0
|
|
36
|
+
Requires-Dist: duckduckgo-search>=8.0.0
|
|
37
|
+
Requires-Dist: pyyaml>=6.0
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: respx>=0.21.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-playwright>=0.5.0; extra == "dev"
|
|
45
|
+
Requires-Dist: black>=24.0.0; extra == "dev"
|
|
46
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
47
|
+
Requires-Dist: mypy>=1.10.0; extra == "dev"
|
|
48
|
+
Requires-Dist: pre-commit>=3.7.0; extra == "dev"
|
|
49
|
+
Requires-Dist: faker>=25.0.0; extra == "dev"
|
|
50
|
+
Requires-Dist: factory-boy>=3.3.0; extra == "dev"
|
|
51
|
+
Requires-Dist: mkdocs>=1.6.0; extra == "dev"
|
|
52
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == "dev"
|
|
53
|
+
|
|
54
|
+
# Phoenix Engine
|
|
55
|
+
|
|
56
|
+
A universal pure-web scraping engine that turns public web pages into structured,
|
|
57
|
+
predictable data. No official APIs required — Phoenix Engine uses raw HTTP
|
|
58
|
+
requests and headless browser automation to extract posts, profiles, listings,
|
|
59
|
+
and articles from social platforms and websites.
|
|
60
|
+
|
|
61
|
+
> **Current status:** beta / early access. A license key is required to run the
|
|
62
|
+
> engine in production.
|
|
63
|
+
|
|
64
|
+
## What it does
|
|
65
|
+
|
|
66
|
+
- Scrapes public pages using **HTTP** or **headless browser** strategies.
|
|
67
|
+
- Returns a **unified JSON output** no matter what platform you target.
|
|
68
|
+
- Automatically adapts to site changes, anti-bot measures, and selector drift.
|
|
69
|
+
- Learns from past scrapes to pick the best strategy for each domain.
|
|
70
|
+
- Can be used from the command line or inside your Python application.
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
### From PyPI
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install phoenix-engine
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### From a GitHub Release wheel
|
|
81
|
+
|
|
82
|
+
Download the `.whl` from the latest release, then:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install phoenix_engine-0.1.0-py3-none-any.whl
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Activate your license
|
|
89
|
+
|
|
90
|
+
Phoenix Engine is distributed under license keys during beta. After installing,
|
|
91
|
+
set your key:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
export PHOENIX_LICENSE_ENFORCEMENT_ENABLED=true
|
|
95
|
+
export PHOENIX_LICENSE_SECRET="your-signing-secret"
|
|
96
|
+
export PHOENIX_LICENSE_KEY="phx.eyJ..."
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Or create a `phoenix.yaml` file:
|
|
100
|
+
|
|
101
|
+
```yaml
|
|
102
|
+
license_enforcement_enabled: true
|
|
103
|
+
license_secret: "your-signing-secret"
|
|
104
|
+
license_key: "phx.eyJ..."
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
If the key is missing, expired, tampered with, or over its use limit, the
|
|
108
|
+
engine will refuse to start.
|
|
109
|
+
|
|
110
|
+
## Quick start — CLI
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# Scrape a single public page
|
|
114
|
+
phoenix scrape "https://example.com/post/123"
|
|
115
|
+
|
|
116
|
+
# Scrape without archiving the raw source
|
|
117
|
+
phoenix scrape "https://example.com/post/123" --no-archive
|
|
118
|
+
|
|
119
|
+
# Scrape multiple URLs in parallel
|
|
120
|
+
phoenix scrape-batch \
|
|
121
|
+
"https://example.com/post/123" \
|
|
122
|
+
"https://example.com/post/456" \
|
|
123
|
+
--output results.json
|
|
124
|
+
|
|
125
|
+
# List built-in platform adapters
|
|
126
|
+
phoenix plugins list
|
|
127
|
+
|
|
128
|
+
# Inspect effective configuration (secrets are masked)
|
|
129
|
+
phoenix config show
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Quick start — Python library
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import asyncio
|
|
136
|
+
from phoenix import PhoenixEngine
|
|
137
|
+
|
|
138
|
+
async def main() -> None:
|
|
139
|
+
    async with PhoenixEngine() as engine:
|
|
140
|
+
        result = await engine.scrape("https://example.com/post/123")
|
|
141
|
+
        print(result.output.model_dump_json(indent=2))
|
|
142
|
+
|
|
143
|
+
asyncio.run(main())
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Configuration
|
|
147
|
+
|
|
148
|
+
Most settings can be controlled with environment variables or a config file
|
|
149
|
+
(`phoenix.yaml`, `phoenix.json`, `phoenix.toml`):
|
|
150
|
+
|
|
151
|
+
```yaml
|
|
152
|
+
timeout: 30
|
|
153
|
+
stealth_enabled: true
|
|
154
|
+
ai_enabled: false
|
|
155
|
+
rate_limits:
|
|
156
|
+
  example.com: 1.0
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Run `phoenix config show` to see the active configuration.
|
|
160
|
+
|
|
161
|
+
## Supported platforms
|
|
162
|
+
|
|
163
|
+
Phoenix Engine ships with adapters for common public platforms and a generic
|
|
164
|
+
fallback for any HTML page:
|
|
165
|
+
|
|
166
|
+
- Instagram, Facebook, X/Twitter, LinkedIn, TikTok, YouTube
|
|
167
|
+
- Generic blogs, listings, and article pages
|
|
168
|
+
|
|
169
|
+
Adapters are plugin-based, so new platforms can be added without touching the
|
|
170
|
+
core engine.
|
|
171
|
+
|
|
172
|
+
## Ethical use
|
|
173
|
+
|
|
174
|
+
Phoenix Engine only scrapes **publicly available** content. Always respect:
|
|
175
|
+
|
|
176
|
+
- The target site's `robots.txt` and Terms of Service.
|
|
177
|
+
- Local laws and data-protection regulations (GDPR, CCPA, etc.).
|
|
178
|
+
- Rate limits — the engine includes built-in throttling to avoid overload.
|
|
179
|
+
|
|
180
|
+
## Support
|
|
181
|
+
|
|
182
|
+
- Issues: https://github.com/phnx-tech/phoenix-engine/issues
|
|
183
|
+
- Repository: https://github.com/phnx-tech/phoenix-engine
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
Commercial beta license. See your license agreement for terms.
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Phoenix Engine
|
|
2
|
+
|
|
3
|
+
A universal pure-web scraping engine that turns public web pages into structured,
|
|
4
|
+
predictable data. No official APIs required — Phoenix Engine uses raw HTTP
|
|
5
|
+
requests and headless browser automation to extract posts, profiles, listings,
|
|
6
|
+
and articles from social platforms and websites.
|
|
7
|
+
|
|
8
|
+
> **Current status:** beta / early access. A license key is required to run the
|
|
9
|
+
> engine in production.
|
|
10
|
+
|
|
11
|
+
## What it does
|
|
12
|
+
|
|
13
|
+
- Scrapes public pages using **HTTP** or **headless browser** strategies.
|
|
14
|
+
- Returns a **unified JSON output** no matter what platform you target.
|
|
15
|
+
- Automatically adapts to site changes, anti-bot measures, and selector drift.
|
|
16
|
+
- Learns from past scrapes to pick the best strategy for each domain.
|
|
17
|
+
- Can be used from the command line or inside your Python application.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
### From PyPI
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install phoenix-engine
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### From a GitHub Release wheel
|
|
28
|
+
|
|
29
|
+
Download the `.whl` from the latest release, then:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install phoenix_engine-0.1.0-py3-none-any.whl
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Activate your license
|
|
36
|
+
|
|
37
|
+
Phoenix Engine is distributed under license keys during beta. After installing,
|
|
38
|
+
set your key:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
export PHOENIX_LICENSE_ENFORCEMENT_ENABLED=true
|
|
42
|
+
export PHOENIX_LICENSE_SECRET="your-signing-secret"
|
|
43
|
+
export PHOENIX_LICENSE_KEY="phx.eyJ..."
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or create a `phoenix.yaml` file:
|
|
47
|
+
|
|
48
|
+
```yaml
|
|
49
|
+
license_enforcement_enabled: true
|
|
50
|
+
license_secret: "your-signing-secret"
|
|
51
|
+
license_key: "phx.eyJ..."
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
If the key is missing, expired, tampered with, or over its use limit, the
|
|
55
|
+
engine will refuse to start.
|
|
56
|
+
|
|
57
|
+
## Quick start — CLI
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Scrape a single public page
|
|
61
|
+
phoenix scrape "https://example.com/post/123"
|
|
62
|
+
|
|
63
|
+
# Scrape without archiving the raw source
|
|
64
|
+
phoenix scrape "https://example.com/post/123" --no-archive
|
|
65
|
+
|
|
66
|
+
# Scrape multiple URLs in parallel
|
|
67
|
+
phoenix scrape-batch \
|
|
68
|
+
"https://example.com/post/123" \
|
|
69
|
+
"https://example.com/post/456" \
|
|
70
|
+
--output results.json
|
|
71
|
+
|
|
72
|
+
# List built-in platform adapters
|
|
73
|
+
phoenix plugins list
|
|
74
|
+
|
|
75
|
+
# Inspect effective configuration (secrets are masked)
|
|
76
|
+
phoenix config show
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick start — Python library
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import asyncio
|
|
83
|
+
from phoenix import PhoenixEngine
|
|
84
|
+
|
|
85
|
+
async def main() -> None:
|
|
86
|
+
    async with PhoenixEngine() as engine:
|
|
86
|
+
        result = await engine.scrape("https://example.com/post/123")
|
|
87
|
+
        print(result.output.model_dump_json(indent=2))
|
|
89
|
+
|
|
90
|
+
asyncio.run(main())
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Configuration
|
|
94
|
+
|
|
95
|
+
Most settings can be controlled with environment variables or a config file
|
|
96
|
+
(`phoenix.yaml`, `phoenix.json`, `phoenix.toml`):
|
|
97
|
+
|
|
98
|
+
```yaml
|
|
99
|
+
timeout: 30
|
|
100
|
+
stealth_enabled: true
|
|
101
|
+
ai_enabled: false
|
|
102
|
+
rate_limits:
|
|
103
|
+
  example.com: 1.0
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Run `phoenix config show` to see the active configuration.
|
|
107
|
+
|
|
108
|
+
## Supported platforms
|
|
109
|
+
|
|
110
|
+
Phoenix Engine ships with adapters for common public platforms and a generic
|
|
111
|
+
fallback for any HTML page:
|
|
112
|
+
|
|
113
|
+
- Instagram, Facebook, X/Twitter, LinkedIn, TikTok, YouTube
|
|
114
|
+
- Generic blogs, listings, and article pages
|
|
115
|
+
|
|
116
|
+
Adapters are plugin-based, so new platforms can be added without touching the
|
|
117
|
+
core engine.
|
|
118
|
+
|
|
119
|
+
## Ethical use
|
|
120
|
+
|
|
121
|
+
Phoenix Engine only scrapes **publicly available** content. Always respect:
|
|
122
|
+
|
|
123
|
+
- The target site's `robots.txt` and Terms of Service.
|
|
124
|
+
- Local laws and data-protection regulations (GDPR, CCPA, etc.).
|
|
125
|
+
- Rate limits — the engine includes built-in throttling to avoid overload.
|
|
126
|
+
|
|
127
|
+
## Support
|
|
128
|
+
|
|
129
|
+
- Issues: https://github.com/phnx-tech/phoenix-engine/issues
|
|
130
|
+
- Repository: https://github.com/phnx-tech/phoenix-engine
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
Commercial beta license. See your license agreement for terms.
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "phoenix-engine"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Universal pure web scraping engine."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Phoenix Engine Team" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["scraping", "web", "social-media", "html", "automation"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"httpx>=0.27.0",
|
|
28
|
+
"playwright>=1.40.0",
|
|
29
|
+
"beautifulsoup4>=4.12.0",
|
|
30
|
+
"lxml>=5.0.0",
|
|
31
|
+
"cssselect>=1.2.0",
|
|
32
|
+
"pydantic>=2.0.0",
|
|
33
|
+
"pydantic-settings>=2.0.0",
|
|
34
|
+
"typer>=0.12.0",
|
|
35
|
+
"sqlalchemy>=2.0.0",
|
|
36
|
+
"alembic>=1.13.0",
|
|
37
|
+
"keyring>=24.0.0",
|
|
38
|
+
"cryptography>=42.0.0",
|
|
39
|
+
"structlog>=24.0.0",
|
|
40
|
+
"openai>=1.0.0",
|
|
41
|
+
"duckduckgo-search>=8.0.0",
|
|
42
|
+
"pyyaml>=6.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.optional-dependencies]
|
|
46
|
+
dev = [
|
|
47
|
+
"build>=1.0.0",
|
|
48
|
+
"pytest>=8.0.0",
|
|
49
|
+
"pytest-asyncio>=0.23.0",
|
|
50
|
+
"pytest-cov>=5.0.0",
|
|
51
|
+
"respx>=0.21.0",
|
|
52
|
+
"pytest-playwright>=0.5.0",
|
|
53
|
+
"black>=24.0.0",
|
|
54
|
+
"ruff>=0.4.0",
|
|
55
|
+
"mypy>=1.10.0",
|
|
56
|
+
"pre-commit>=3.7.0",
|
|
57
|
+
"faker>=25.0.0",
|
|
58
|
+
"factory-boy>=3.3.0",
|
|
59
|
+
"mkdocs>=1.6.0",
|
|
60
|
+
"mkdocs-material>=9.5.0",
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
[project.scripts]
|
|
64
|
+
phoenix = "phoenix.cli.main:app"
|
|
65
|
+
|
|
66
|
+
[project.urls]
|
|
67
|
+
Homepage = "https://github.com/phnx-tech/phoenix-engine"
|
|
68
|
+
Documentation = "https://github.com/phnx-tech/phoenix-engine#readme"
|
|
69
|
+
Repository = "https://github.com/phnx-tech/phoenix-engine.git"
|
|
70
|
+
Issues = "https://github.com/phnx-tech/phoenix-engine/issues"
|
|
71
|
+
|
|
72
|
+
[tool.setuptools.packages.find]
|
|
73
|
+
where = ["src"]
|
|
74
|
+
|
|
75
|
+
[tool.black]
|
|
76
|
+
line-length = 100
|
|
77
|
+
target-version = ["py311", "py312", "py313"]
|
|
78
|
+
include = "\\.pyi?$"
|
|
79
|
+
extend-exclude = """
|
|
80
|
+
/(
|
|
81
|
+
migrations
|
|
82
|
+
| archive
|
|
83
|
+
| \\.venv
|
|
84
|
+
)/
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
[tool.ruff]
|
|
88
|
+
target-version = "py311"
|
|
89
|
+
line-length = 100
|
|
90
|
+
|
|
91
|
+
[tool.ruff.lint]
|
|
92
|
+
select = ["ALL"]
|
|
93
|
+
ignore = [
|
|
94
|
+
"D105",
|
|
95
|
+
"D107",
|
|
96
|
+
"CPY001",
|
|
97
|
+
"TD003",
|
|
98
|
+
"FIX002",
|
|
99
|
+
"FBT003",
|
|
100
|
+
"ERA001",
|
|
101
|
+
"EM101",
|
|
102
|
+
"EM102",
|
|
103
|
+
"TRY003",
|
|
104
|
+
"D407",
|
|
105
|
+
"D413",
|
|
106
|
+
]
|
|
107
|
+
|
|
108
|
+
[tool.ruff.lint.pydocstyle]
|
|
109
|
+
convention = "google"
|
|
110
|
+
|
|
111
|
+
[tool.ruff.lint.mccabe]
|
|
112
|
+
max-complexity = 12
|
|
113
|
+
|
|
114
|
+
[tool.ruff.lint.per-file-ignores]
|
|
115
|
+
"tests/*" = [
|
|
116
|
+
"S101",
|
|
117
|
+
"D100",
|
|
118
|
+
"D103",
|
|
119
|
+
"SLF001",
|
|
120
|
+
"ARG001",
|
|
121
|
+
"PLR2004",
|
|
122
|
+
"PLC0415",
|
|
123
|
+
]
|
|
124
|
+
"tests/fixtures/*" = ["ALL"]
|
|
125
|
+
"*/__init__.py" = ["D104"]
|
|
126
|
+
"src/phoenix/adapters/generated/*.py" = ["ALL"]
|
|
127
|
+
"src/phoenix/cli/main.py" = ["ARG001", "B008", "FBT001", "PLR0913", "TC003"]
|
|
128
|
+
"src/phoenix/pipeline.py" = ["ANN401", "BLE001", "S112"]
|
|
129
|
+
"src/phoenix/adapters/base.py" = ["F401", "RUF001", "TC001", "UP035"]
|
|
130
|
+
"src/phoenix/adapters/generic.py" = ["ARG002"]
|
|
131
|
+
"src/phoenix/router.py" = ["PLC0415"]
|
|
132
|
+
"src/phoenix/plugins/loader.py" = ["ANN401", "PLC0415", "TC003", "TRY004", "TRY300"]
|
|
133
|
+
"src/phoenix/plugins/registry.py" = ["PYI034", "SLF001", "TC003"]
|
|
134
|
+
"tests/unit/test_plugin_loader.py" = ["ANN401", "ARG002", "D102", "PLW0108", "Q001", "TC001", "W605"]
|
|
135
|
+
"tests/unit/test_plugin_interface.py" = ["ANN401", "ARG002", "D102"]
|
|
136
|
+
|
|
137
|
+
[tool.mypy]
|
|
138
|
+
python_version = "3.11"
|
|
139
|
+
strict = true
|
|
140
|
+
warn_return_any = true
|
|
141
|
+
warn_unused_ignores = true
|
|
142
|
+
warn_unreachable = true
|
|
143
|
+
disallow_untyped_defs = true
|
|
144
|
+
disallow_incomplete_defs = true
|
|
145
|
+
check_untyped_defs = true
|
|
146
|
+
no_implicit_optional = true
|
|
147
|
+
warn_redundant_casts = true
|
|
148
|
+
warn_unused_configs = true
|
|
149
|
+
show_error_codes = true
|
|
150
|
+
|
|
151
|
+
[tool.pytest.ini_options]
|
|
152
|
+
asyncio_mode = "auto"
|
|
153
|
+
testpaths = ["tests"]
|
|
154
|
+
markers = [
|
|
155
|
+
"integration: marks tests as integration tests",
|
|
156
|
+
]
|
|
157
|
+
addopts = "-ra"
|
|
158
|
+
|
|
159
|
+
[tool.coverage.run]
|
|
160
|
+
source = ["phoenix"]
|
|
161
|
+
branch = true
|
|
162
|
+
|
|
163
|
+
[tool.coverage.report]
|
|
164
|
+
fail_under = 85
|
|
165
|
+
skip_covered = false
|
|
166
|
+
show_missing = true
|
|
167
|
+
include = [
|
|
168
|
+
"src/phoenix/adapters/*.py",
|
|
169
|
+
"src/phoenix/collectors/*",
|
|
170
|
+
"src/phoenix/infrastructure/*.py",
|
|
171
|
+
"src/phoenix/models/*",
|
|
172
|
+
"src/phoenix/options.py",
|
|
173
|
+
"src/phoenix/exceptions.py",
|
|
174
|
+
"src/phoenix/plugins/*.py",
|
|
175
|
+
"src/phoenix/router.py",
|
|
176
|
+
"src/phoenix/version.py",
|
|
177
|
+
"src/phoenix/processing/ai_assistant.py",
|
|
178
|
+
"src/phoenix/processing/archiver.py",
|
|
179
|
+
"src/phoenix/processing/phoenix_ai_extractor.py",
|
|
180
|
+
"src/phoenix/intelligence/*.py",
|
|
181
|
+
"src/phoenix/pipeline.py",
|
|
182
|
+
"src/phoenix/engine.py",
|
|
183
|
+
"src/phoenix/scrapers/*.py",
|
|
184
|
+
]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Phoenix Engine -- universal pure web scraping engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from phoenix.adapters import (
|
|
6
|
+
BaseAdapter,
|
|
7
|
+
GenericWebAdapter,
|
|
8
|
+
PluginInterface,
|
|
9
|
+
ScraperPlugin,
|
|
10
|
+
)
|
|
11
|
+
from phoenix.engine import PhoenixEngine
|
|
12
|
+
from phoenix.models.output import (
|
|
13
|
+
CollectionResult,
|
|
14
|
+
ScrapingResult,
|
|
15
|
+
UnifiedOutput,
|
|
16
|
+
)
|
|
17
|
+
from phoenix.models.strategy import ScrapingStrategy
|
|
18
|
+
|
|
19
|
+
# Options and core models are meant to load before the engine to prevent circular imports
|
|
20
|
+
# with router/pipeline submodules. NOTE(review): phoenix.engine is imported above -- verify.
|
|
21
|
+
from phoenix.options import CollectionOptions, ScrapingOptions
|
|
22
|
+
from phoenix.plugins import PluginLoader, PluginManifest, PluginRegistry
|
|
23
|
+
from phoenix.version import __version__
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"BaseAdapter",
|
|
27
|
+
"CollectionOptions",
|
|
28
|
+
"CollectionResult",
|
|
29
|
+
"GenericWebAdapter",
|
|
30
|
+
"PhoenixEngine",
|
|
31
|
+
"PluginInterface",
|
|
32
|
+
"PluginLoader",
|
|
33
|
+
"PluginManifest",
|
|
34
|
+
"PluginRegistry",
|
|
35
|
+
"ScraperPlugin",
|
|
36
|
+
"ScrapingOptions",
|
|
37
|
+
"ScrapingResult",
|
|
38
|
+
"ScrapingStrategy",
|
|
39
|
+
"UnifiedOutput",
|
|
40
|
+
"__version__",
|
|
41
|
+
]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Platform adapters that parse HTML into structured data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from phoenix.adapters.base import BaseAdapter, PluginInterface, ScraperPlugin
|
|
6
|
+
from phoenix.adapters.facebook import FacebookAdapter
|
|
7
|
+
from phoenix.adapters.generic import GenericWebAdapter
|
|
8
|
+
from phoenix.adapters.instagram import InstagramAdapter
|
|
9
|
+
from phoenix.adapters.linkedin import LinkedInAdapter
|
|
10
|
+
from phoenix.adapters.tiktok import TikTokAdapter
|
|
11
|
+
from phoenix.adapters.x_twitter import XTwitterAdapter
|
|
12
|
+
from phoenix.adapters.youtube import YouTubeAdapter
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BaseAdapter",
|
|
16
|
+
"FacebookAdapter",
|
|
17
|
+
"GenericWebAdapter",
|
|
18
|
+
"InstagramAdapter",
|
|
19
|
+
"LinkedInAdapter",
|
|
20
|
+
"PluginInterface",
|
|
21
|
+
"ScraperPlugin",
|
|
22
|
+
"TikTokAdapter",
|
|
23
|
+
"XTwitterAdapter",
|
|
24
|
+
"YouTubeAdapter",
|
|
25
|
+
]
|