unrealon 1.1.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon-1.1.1/PKG-INFO +722 -0
- unrealon-1.1.1/README.md +643 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/pyproject.toml +1 -1
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/__init__.py +1 -1
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/__init__.py +1 -1
- unrealon-1.1.0/PKG-INFO +0 -164
- unrealon-1.1.0/README.md +0 -85
- {unrealon-1.1.0 → unrealon-1.1.1}/.gitignore +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/LICENSE +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/MANIFEST.in +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/requirements-dev.txt +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/requirements-test.txt +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/requirements.txt +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/cli.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/base.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/commands.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/connection.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/events.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/health.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/html_parser.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/logging.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/proxy.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/scheduler.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/client/session.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/configs/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/configs/bridge_config.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/configs/bridge_config.yaml +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/base.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/command.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/events.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/html_parser.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/logging.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/parser.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/proxy.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/requests.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/responses.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/scheduler.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/models/session.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/base.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/command.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/html_parser.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/logging.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/parser.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/proxy.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/scheduler.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_bridge/server/handlers/session.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/README.md +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/cli/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/cli/browser_cli.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/cli/cookies_cli.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/cli/interactive_mode.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/cli/main.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/core/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/core/browser_manager.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/models/config.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/models/core.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/models/dataclasses.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/models/detection.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/models/enums.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/dto/models/statistics.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/managers/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/managers/captcha.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/managers/cookies.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/managers/logger_bridge.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/managers/profile.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_browser/managers/stealth.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/browser/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/browser/config.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/browser/manager.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/exceptions.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/cli_manager.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/daemon_manager.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/__init__.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/browser.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/config.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/error.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/html.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/logging.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/managers/result.py +0 -0
- {unrealon-1.1.0 → unrealon-1.1.1}/src/unrealon_driver/parser/parser_manager.py +0 -0
unrealon-1.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,722 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: unrealon
|
|
3
|
+
Version: 1.1.1
|
|
4
|
+
Summary: Advanced browser automation framework with WebSocket bridge for distributed web scraping
|
|
5
|
+
Project-URL: Homepage, https://github.com/unrealos/unrealon-rpc
|
|
6
|
+
Project-URL: Documentation, https://unrealon-rpc.readthedocs.io
|
|
7
|
+
Project-URL: Repository, https://github.com/unrealos/unrealon-rpc.git
|
|
8
|
+
Project-URL: Issues, https://github.com/unrealos/unrealon-rpc/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/unrealos/unrealon-rpc/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: UnrealOS Team <dev@unrealos.com>
|
|
11
|
+
Maintainer-email: UnrealOS Team <dev@unrealos.com>
|
|
12
|
+
License: MIT
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: async,bridge,browser-automation,html-processing,parsing,playwright,pydantic,stealth,web-scraping,websocket
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Topic :: Communications
|
|
25
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
28
|
+
Classifier: Typing :: Typed
|
|
29
|
+
Requires-Python: <4.0,>=3.10
|
|
30
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
31
|
+
Requires-Dist: asyncio-mqtt>=0.16.0
|
|
32
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
33
|
+
Requires-Dist: click>=8.2.0
|
|
34
|
+
Requires-Dist: httpx>=0.26.0
|
|
35
|
+
Requires-Dist: ipfshttpclient>=0.8.0a2
|
|
36
|
+
Requires-Dist: lxml>=5.0.0
|
|
37
|
+
Requires-Dist: playwright-stealth<2.0.0,>=1.0.5
|
|
38
|
+
Requires-Dist: playwright>=1.40.0
|
|
39
|
+
Requires-Dist: pydantic<3.0,>=2.11
|
|
40
|
+
Requires-Dist: python-dateutil>=2.8
|
|
41
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
42
|
+
Requires-Dist: pyyaml>=6.0
|
|
43
|
+
Requires-Dist: redis>=5.0.0
|
|
44
|
+
Requires-Dist: rich>=13.0.0
|
|
45
|
+
Requires-Dist: tomlkit>=0.13.0
|
|
46
|
+
Requires-Dist: websockets>=12.0
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: bandit>=1.7.0; extra == 'dev'
|
|
49
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: build>=1.0.0; extra == 'dev'
|
|
51
|
+
Requires-Dist: flake8>=6.0.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: isort>=5.12.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: mkdocs-material>=9.0.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: mkdocs>=1.5.0; extra == 'dev'
|
|
55
|
+
Requires-Dist: mkdocstrings[python]>=0.22.0; extra == 'dev'
|
|
56
|
+
Requires-Dist: mypy>=1.5.0; extra == 'dev'
|
|
57
|
+
Requires-Dist: pre-commit>=3.0.0; extra == 'dev'
|
|
58
|
+
Requires-Dist: pydocstyle>=6.3.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
60
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
61
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == 'dev'
|
|
62
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == 'dev'
|
|
63
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
64
|
+
Requires-Dist: questionary>=2.1.0; extra == 'dev'
|
|
65
|
+
Requires-Dist: twine>=4.0.0; extra == 'dev'
|
|
66
|
+
Provides-Extra: docs
|
|
67
|
+
Requires-Dist: mkdocs-material>=9.0.0; extra == 'docs'
|
|
68
|
+
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
69
|
+
Requires-Dist: mkdocstrings[python]>=0.22.0; extra == 'docs'
|
|
70
|
+
Requires-Dist: pymdown-extensions>=10.0.0; extra == 'docs'
|
|
71
|
+
Provides-Extra: test
|
|
72
|
+
Requires-Dist: factory-boy>=3.2.0; extra == 'test'
|
|
73
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'test'
|
|
74
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'test'
|
|
75
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == 'test'
|
|
76
|
+
Requires-Dist: pytest-xdist>=3.0.0; extra == 'test'
|
|
77
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
78
|
+
Description-Content-Type: text/markdown
|
|
79
|
+
|
|
80
|
+
# UnrealOn
|
|
81
|
+
|
|
82
|
+
**Enterprise browser automation framework with WebSocket bridge for distributed web scraping.**
|
|
83
|
+
|
|
84
|
+
[PyPI version](https://badge.fury.io/py/unrealon)
|
|
85
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
86
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
87
|
+
|
|
88
|
+
## 🌐 About UnrealOn
|
|
89
|
+
|
|
90
|
+
**UnrealOn** is a comprehensive web scraping platform that provides enterprise-grade infrastructure for data extraction at scale. Built on top of this framework, the **UnrealOn Server** offers managed hosting, real-time orchestration, and advanced anti-bot protection.
|
|
91
|
+
|
|
92
|
+
**Platform**: [unrealon.com](https://unrealon.com) - Enterprise web scraping infrastructure
|
|
93
|
+
**Framework**: This repository - Open-source parser development framework
|
|
94
|
+
|
|
95
|
+
**No Vendor Lock-in**: Use the framework locally for development, then deploy to any infrastructure - self-hosted, cloud, or managed UnrealOn Server.
|
|
96
|
+
|
|
97
|
+
## 🚀 Quick Start
|
|
98
|
+
|
|
99
|
+
**Get started in minutes with our simple parser framework:**
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Install the framework
|
|
103
|
+
pip install unrealon
|
|
104
|
+
|
|
105
|
+
# Create your first parser
|
|
106
|
+
python -c "from unrealon import ParserManager, BrowserManager"
|
|
107
|
+
|
|
108
|
+
# That's it! You're ready to build parsers
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## 📦 Ready-to-Use Amazon Parser
|
|
112
|
+
|
|
113
|
+
**Get started immediately with our pre-configured Amazon parser:**
|
|
114
|
+
- **[GitHub Repository](https://github.com/markolofsen/unrealon-parser-amazon)** - Complete Amazon parser with all configurations
|
|
115
|
+
- **Zero Setup**: Clone and run with minimal configuration
|
|
116
|
+
- **Production Ready**: Includes all enterprise features and optimizations
|
|
117
|
+
- **Real Examples**: See how to build production parsers with UnrealOn
|
|
118
|
+
|
|
119
|
+
## Overview
|
|
120
|
+
|
|
121
|
+
UnrealOn is a modern Python framework for building web scrapers with enterprise-grade features. It combines browser automation, AI-powered extraction, and real-time orchestration in a simple, developer-friendly package.
|
|
122
|
+
|
|
123
|
+
**Key Features:**
|
|
124
|
+
- **Zero Configuration**: Everything works out of the box
|
|
125
|
+
- **Browser Automation**: Built-in Playwright with stealth capabilities
|
|
126
|
+
- **AI-Powered Extraction**: Automatic content analysis and selector generation
|
|
127
|
+
- **Real-Time Communication**: WebSocket bridge for distributed parsing
|
|
128
|
+
- **Enterprise Ready**: Logging, monitoring, and error handling included
|
|
129
|
+
|
|
130
|
+
## Why Choose UnrealOn?
|
|
131
|
+
|
|
132
|
+
### 🆚 **Simple Comparison**
|
|
133
|
+
|
|
134
|
+
| Feature | UnrealOn | Scrapy | Selenium | Custom Solution |
|
|
135
|
+
|---------|----------|--------|----------|-----------------|
|
|
136
|
+
| **Setup Time** | ✅ 5 minutes | ❌ 30+ minutes | ❌ 20+ minutes | ❌ Hours |
|
|
137
|
+
| **Browser Automation** | ✅ Built-in | ❌ Requires setup | ✅ Built-in | ❌ Manual |
|
|
138
|
+
| **AI Extraction** | ✅ Automatic | ❌ Manual | ❌ Manual | ❌ Custom dev |
|
|
139
|
+
| **Real-time Communication** | ✅ WebSocket | ❌ HTTP only | ❌ HTTP only | ❌ Manual |
|
|
140
|
+
| **Proxy Support** | ✅ Auto-rotation | ❌ Manual | ❌ Manual | ❌ Manual |
|
|
141
|
+
| **Error Handling** | ✅ Built-in | ❌ Manual | ❌ Manual | ❌ Manual |
|
|
142
|
+
| **Logging** | ✅ Structured | ❌ Basic | ❌ Basic | ❌ Manual |
|
|
143
|
+
|
|
144
|
+
### 💎 **Key Advantages**
|
|
145
|
+
|
|
146
|
+
#### **1. Zero Configuration**
|
|
147
|
+
**Problem**: Complex setup with multiple dependencies
|
|
148
|
+
**Solution**: Install and start coding immediately
|
|
149
|
+
```python
|
|
150
|
+
# No config files needed - everything works out of the box
|
|
151
|
+
from unrealon import ParserManager
|
|
152
|
+
parser = ParserManager()
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
#### **2. Built-in Browser Automation**
|
|
156
|
+
**Problem**: Manual browser setup and management
|
|
157
|
+
**Solution**: Automatic browser handling with stealth
|
|
158
|
+
```python
|
|
159
|
+
# Browser automation with one line
|
|
160
|
+
browser = BrowserManager()
|
|
161
|
+
await browser.navigate("https://example.com")
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
#### **3. AI-Powered Extraction**
|
|
165
|
+
**Problem**: Manual selector writing and maintenance
|
|
166
|
+
**Solution**: Automatic content analysis and extraction
|
|
167
|
+
```python
|
|
168
|
+
# AI automatically finds and extracts data
|
|
169
|
+
result = await parser.extract_with_ai("https://example.com")
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
#### **4. Real-Time Communication**
|
|
173
|
+
**Problem**: Batch processing with delayed results
|
|
174
|
+
**Solution**: Instant command execution and monitoring
|
|
175
|
+
```python
|
|
176
|
+
# Real-time parser management
|
|
177
|
+
await parser.start_daemon() # Listens for commands
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Installation
|
|
181
|
+
|
|
182
|
+
### Prerequisites
|
|
183
|
+
|
|
184
|
+
- Python 3.10 or higher
|
|
185
|
+
- pip or poetry for package management
|
|
186
|
+
|
|
187
|
+
### Quick Installation
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Install with pip
|
|
191
|
+
pip install unrealon
|
|
192
|
+
|
|
193
|
+
# Or with poetry
|
|
194
|
+
poetry add unrealon
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Development Installation
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
# Clone the repository
|
|
201
|
+
git clone https://github.com/unrealos/unrealon-rpc.git
|
|
202
|
+
cd unrealon-rpc
|
|
203
|
+
|
|
204
|
+
# Install in development mode
|
|
205
|
+
pip install -e .
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Simple Examples
|
|
209
|
+
|
|
210
|
+
### 1. Basic Parser (5 minutes)
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from unrealon import ParserManager
|
|
214
|
+
import asyncio
|
|
215
|
+
|
|
216
|
+
class SimpleParser(ParserManager):
|
|
217
|
+
"""Simple product parser - just write your logic!"""
|
|
218
|
+
|
|
219
|
+
async def parse_products(self, url: str):
|
|
220
|
+
"""Parse products from a listing page."""
|
|
221
|
+
# Navigate to the page
|
|
222
|
+
await self.browser.navigate(url)
|
|
223
|
+
|
|
224
|
+
# Extract data using AI (automatic selectors)
|
|
225
|
+
result = await self.extract_with_ai(
|
|
226
|
+
url,
|
|
227
|
+
instruction="Extract all product information including title, price, and image"
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
return result.data
|
|
231
|
+
|
|
232
|
+
# Usage
|
|
233
|
+
async def main():
|
|
234
|
+
parser = SimpleParser()
|
|
235
|
+
await parser.setup()
|
|
236
|
+
|
|
237
|
+
products = await parser.parse_products("https://example.com/products")
|
|
238
|
+
print(f"Found {len(products)} products")
|
|
239
|
+
|
|
240
|
+
await parser.cleanup()
|
|
241
|
+
|
|
242
|
+
if __name__ == "__main__":
|
|
243
|
+
asyncio.run(main())
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### 2. Traditional Parser with BeautifulSoup
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
from unrealon import ParserManager
|
|
250
|
+
from bs4 import BeautifulSoup
|
|
251
|
+
|
|
252
|
+
class TraditionalParser(ParserManager):
|
|
253
|
+
"""Traditional parser using CSS selectors."""
|
|
254
|
+
|
|
255
|
+
async def parse_products(self, url: str):
|
|
256
|
+
"""Parse products using CSS selectors."""
|
|
257
|
+
# Get HTML content
|
|
258
|
+
html = await self.browser.get_html(url)
|
|
259
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
260
|
+
|
|
261
|
+
products = []
|
|
262
|
+
for item in soup.select(".product-item"):
|
|
263
|
+
product = {
|
|
264
|
+
"title": item.select_one(".title").text.strip(),
|
|
265
|
+
"price": item.select_one(".price").text.strip(),
|
|
266
|
+
"image": item.select_one("img")["src"]
|
|
267
|
+
}
|
|
268
|
+
products.append(product)
|
|
269
|
+
|
|
270
|
+
return products
|
|
271
|
+
|
|
272
|
+
# Usage
|
|
273
|
+
async def main():
|
|
274
|
+
parser = TraditionalParser()
|
|
275
|
+
await parser.setup()
|
|
276
|
+
|
|
277
|
+
products = await parser.parse_products("https://example.com/products")
|
|
278
|
+
print(f"Found {len(products)} products")
|
|
279
|
+
|
|
280
|
+
await parser.cleanup()
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### 3. Daemon Mode for Real-Time Processing
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
import asyncio
from unrealon import ParserManager
|
|
287
|
+
|
|
288
|
+
class DaemonParser(ParserManager):
|
|
289
|
+
"""Parser that runs as a daemon for real-time commands."""
|
|
290
|
+
|
|
291
|
+
async def handle_parse_command(self, command):
|
|
292
|
+
"""Handle remote parse commands."""
|
|
293
|
+
url = command.data.get("url")
|
|
294
|
+
return await self.parse_products(url)
|
|
295
|
+
|
|
296
|
+
async def parse_products(self, url: str):
|
|
297
|
+
"""Parse products from URL."""
|
|
298
|
+
await self.browser.navigate(url)
|
|
299
|
+
result = await self.extract_with_ai(url, "Extract products")
|
|
300
|
+
return result.data
|
|
301
|
+
|
|
302
|
+
# Run as daemon
|
|
303
|
+
async def main():
|
|
304
|
+
parser = DaemonParser()
|
|
305
|
+
await parser.start_daemon() # Listens for commands
|
|
306
|
+
|
|
307
|
+
if __name__ == "__main__":
|
|
308
|
+
asyncio.run(main())
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### 4. Scheduled Parser
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
from unrealon import ParserManager
|
|
315
|
+
import asyncio
|
|
316
|
+
|
|
317
|
+
class ScheduledParser(ParserManager):
|
|
318
|
+
"""Parser that runs on a schedule."""
|
|
319
|
+
|
|
320
|
+
async def run_scheduled(self):
|
|
321
|
+
"""Main method called by scheduler."""
|
|
322
|
+
urls = [
|
|
323
|
+
"https://example.com/products",
|
|
324
|
+
"https://example.com/deals",
|
|
325
|
+
"https://example.com/new"
|
|
326
|
+
]
|
|
327
|
+
|
|
328
|
+
all_products = []
|
|
329
|
+
for url in urls:
|
|
330
|
+
products = await self.parse_products(url)
|
|
331
|
+
all_products.extend(products)
|
|
332
|
+
|
|
333
|
+
return {"products": all_products, "count": len(all_products)}
|
|
334
|
+
|
|
335
|
+
# Run with scheduling
|
|
336
|
+
async def main():
|
|
337
|
+
parser = ScheduledParser()
|
|
338
|
+
await parser.setup()
|
|
339
|
+
|
|
340
|
+
# Run once
|
|
341
|
+
result = await parser.run_scheduled()
|
|
342
|
+
print(f"Collected {result['count']} products")
|
|
343
|
+
|
|
344
|
+
await parser.cleanup()
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
## CLI Tools
|
|
348
|
+
|
|
349
|
+
### Browser Automation CLI
|
|
350
|
+
|
|
351
|
+
```bash
|
|
352
|
+
# Launch browser session
|
|
353
|
+
unrealon-browser browser launch --url https://example.com
|
|
354
|
+
|
|
355
|
+
# Test stealth capabilities
|
|
356
|
+
unrealon-browser browser stealth-test --url https://example.com
|
|
357
|
+
|
|
358
|
+
# Interactive browser mode
|
|
359
|
+
unrealon-browser browser interactive
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
### Cookie Management CLI
|
|
363
|
+
|
|
364
|
+
```bash
|
|
365
|
+
# List stored cookies
|
|
366
|
+
browser-cookies list --parser my_parser
|
|
367
|
+
|
|
368
|
+
# Clear cookies
|
|
369
|
+
browser-cookies clear --parser my_parser
|
|
370
|
+
|
|
371
|
+
# Show cookie statistics
|
|
372
|
+
browser-cookies stats --parser my_parser
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
## Configuration
|
|
376
|
+
|
|
377
|
+
### Environment Variables (Optional)
|
|
378
|
+
|
|
379
|
+
Create a `.env` file for custom settings:
|
|
380
|
+
|
|
381
|
+
```bash
|
|
382
|
+
# Browser settings
|
|
383
|
+
BROWSER_HEADLESS=true
|
|
384
|
+
BROWSER_TIMEOUT=30
|
|
385
|
+
|
|
386
|
+
# Logging
|
|
387
|
+
LOG_LEVEL=INFO
|
|
388
|
+
LOG_TO_FILE=true
|
|
389
|
+
|
|
390
|
+
# Proxy settings (optional)
|
|
391
|
+
PROXY_HOST=proxy.example.com
|
|
392
|
+
PROXY_PORT=8080
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
### Custom Configuration
|
|
396
|
+
|
|
397
|
+
```python
|
|
398
|
+
from unrealon import ParserManager, BrowserConfig
|
|
399
|
+
|
|
400
|
+
# Custom browser configuration
|
|
401
|
+
browser_config = BrowserConfig(
|
|
402
|
+
headless=True,
|
|
403
|
+
timeout=30,
|
|
404
|
+
user_agent="Custom User Agent"
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
# Use custom config
|
|
408
|
+
parser = ParserManager(browser_config=browser_config)
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
## Advanced Features
|
|
412
|
+
|
|
413
|
+
### AI-Powered Extraction
|
|
414
|
+
|
|
415
|
+
```python
|
|
416
|
+
# Automatic content analysis
|
|
417
|
+
result = await parser.extract_with_ai(
|
|
418
|
+
url="https://example.com/products",
|
|
419
|
+
instruction="Extract product title, price, rating, and review count",
|
|
420
|
+
confidence_threshold=0.8
|
|
421
|
+
)
|
|
422
|
+
|
|
423
|
+
print(f"Extracted {len(result.data)} items")
|
|
424
|
+
print(f"Confidence: {result.confidence}")
|
|
425
|
+
print(f"Cost: ${result.cost_usd}")
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
### Proxy Management
|
|
429
|
+
|
|
430
|
+
```python
|
|
431
|
+
from unrealon import ProxyConfig
|
|
432
|
+
|
|
433
|
+
# Configure proxy rotation
|
|
434
|
+
proxy_config = ProxyConfig(
|
|
435
|
+
proxies=[
|
|
436
|
+
"http://proxy1:8080",
|
|
437
|
+
"http://proxy2:8080",
|
|
438
|
+
"http://proxy3:8080"
|
|
439
|
+
],
|
|
440
|
+
rotation_strategy="round_robin"
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
parser = ParserManager(proxy_config=proxy_config)
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
### Real-Time Communication
|
|
447
|
+
|
|
448
|
+
```python
|
|
449
|
+
# Start daemon with WebSocket connection
|
|
450
|
+
await parser.start_daemon(
|
|
451
|
+
server_url="wss://api.unrealon.com",
|
|
452
|
+
api_key="your_api_key"
|
|
453
|
+
)
|
|
454
|
+
|
|
455
|
+
# Parser now listens for remote commands
|
|
456
|
+
# Commands can be sent from any client
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
## Architecture
|
|
460
|
+
|
|
461
|
+
### Core Components
|
|
462
|
+
|
|
463
|
+
```
|
|
464
|
+
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
465
|
+
│ Parser SDK │◄──►│ WebSocket │◄──►│ Target │
|
|
466
|
+
│ (Client) │ │ Bridge │ │ Websites │
|
|
467
|
+
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
468
|
+
│ │ │
|
|
469
|
+
▼ ▼ ▼
|
|
470
|
+
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
471
|
+
│ Browser │ │ AI Services │ │ Proxy & │
|
|
472
|
+
│ Automation │ │ (LLM) │ │ Stealth │
|
|
473
|
+
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
### Module Structure
|
|
477
|
+
|
|
478
|
+
- **`unrealon_driver`**: Core parser framework and management
|
|
479
|
+
- **`unrealon_bridge`**: WebSocket communication and orchestration
|
|
480
|
+
- **`unrealon_browser`**: Browser automation with stealth capabilities
|
|
481
|
+
|
|
482
|
+
## Best Practices
|
|
483
|
+
|
|
484
|
+
### 1. Error Handling
|
|
485
|
+
|
|
486
|
+
```python
|
|
487
|
+
class RobustParser(ParserManager):
|
|
488
|
+
async def parse_products(self, url: str):
|
|
489
|
+
try:
|
|
490
|
+
await self.browser.navigate(url)
|
|
491
|
+
result = await self.extract_with_ai(url, "Extract products")
|
|
492
|
+
return result.data
|
|
493
|
+
except Exception as e:
|
|
494
|
+
self.logger.error(f"Parsing failed: {e}")
|
|
495
|
+
return {"error": str(e), "success": False}
|
|
496
|
+
```
|
|
497
|
+
|
|
498
|
+
### 2. Rate Limiting
|
|
499
|
+
|
|
500
|
+
```python
|
|
501
|
+
import asyncio
|
|
502
|
+
|
|
503
|
+
class RateLimitedParser(ParserManager):
|
|
504
|
+
async def parse_multiple_pages(self, urls: list):
|
|
505
|
+
results = []
|
|
506
|
+
for url in urls:
|
|
507
|
+
result = await self.parse_products(url)
|
|
508
|
+
results.append(result)
|
|
509
|
+
|
|
510
|
+
# Rate limiting
|
|
511
|
+
await asyncio.sleep(2) # 2 second delay
|
|
512
|
+
|
|
513
|
+
return results
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
### 3. Data Validation
|
|
517
|
+
|
|
518
|
+
```python
|
|
519
|
+
from pydantic import BaseModel
|
|
520
|
+
|
|
521
|
+
class Product(BaseModel):
|
|
522
|
+
title: str
|
|
523
|
+
price: float
|
|
524
|
+
image_url: str
|
|
525
|
+
|
|
526
|
+
class ValidatedParser(ParserManager):
|
|
527
|
+
async def parse_products(self, url: str):
|
|
528
|
+
raw_data = await self.extract_with_ai(url, "Extract products")
|
|
529
|
+
|
|
530
|
+
# Validate data
|
|
531
|
+
products = []
|
|
532
|
+
for item in raw_data.data:
|
|
533
|
+
try:
|
|
534
|
+
product = Product(**item)
|
|
535
|
+
products.append(product)
|
|
536
|
+
except Exception as e:
|
|
537
|
+
self.logger.warning(f"Invalid product data: {e}")
|
|
538
|
+
|
|
539
|
+
return products
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
## Testing
|
|
543
|
+
|
|
544
|
+
### Unit Tests
|
|
545
|
+
|
|
546
|
+
```python
|
|
547
|
+
import pytest
|
|
548
|
+
from unrealon import ParserManager
|
|
549
|
+
|
|
550
|
+
class TestParser(ParserManager):
|
|
551
|
+
async def parse_products(self, url: str):
|
|
552
|
+
return [{"title": "Test Product", "price": 99.99}]
|
|
553
|
+
|
|
554
|
+
@pytest.mark.asyncio
|
|
555
|
+
async def test_parser():
|
|
556
|
+
parser = TestParser()
|
|
557
|
+
await parser.setup()
|
|
558
|
+
|
|
559
|
+
result = await parser.parse_products("https://example.com")
|
|
560
|
+
assert len(result) == 1
|
|
561
|
+
assert result[0]["title"] == "Test Product"
|
|
562
|
+
|
|
563
|
+
await parser.cleanup()
|
|
564
|
+
```
|
|
565
|
+
|
|
566
|
+
### Integration Tests
|
|
567
|
+
|
|
568
|
+
```python
|
|
569
|
+
@pytest.mark.asyncio
|
|
570
|
+
async def test_browser_integration():
|
|
571
|
+
parser = ParserManager()
|
|
572
|
+
await parser.setup()
|
|
573
|
+
|
|
574
|
+
# Test actual browser navigation
|
|
575
|
+
await parser.browser.navigate("https://httpbin.org/html")
|
|
576
|
+
html = await parser.browser.get_html()
|
|
577
|
+
|
|
578
|
+
assert "Herman Melville" in html # httpbin test content
|
|
579
|
+
|
|
580
|
+
await parser.cleanup()
|
|
581
|
+
```
|
|
582
|
+
|
|
583
|
+
## Deployment
|
|
584
|
+
|
|
585
|
+
### Docker Deployment
|
|
586
|
+
|
|
587
|
+
```dockerfile
|
|
588
|
+
FROM python:3.11-slim
|
|
589
|
+
|
|
590
|
+
WORKDIR /app
|
|
591
|
+
COPY requirements.txt .
|
|
592
|
+
RUN pip install -r requirements.txt
|
|
593
|
+
|
|
594
|
+
COPY . .
|
|
595
|
+
CMD ["python", "parser.py"]
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
### Docker Compose
|
|
599
|
+
|
|
600
|
+
```yaml
|
|
601
|
+
version: '3.8'
|
|
602
|
+
services:
|
|
603
|
+
parser:
|
|
604
|
+
build: .
|
|
605
|
+
environment:
|
|
606
|
+
- LOG_LEVEL=INFO
|
|
607
|
+
volumes:
|
|
608
|
+
- ./logs:/app/logs
|
|
609
|
+
restart: unless-stopped
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
## Troubleshooting
|
|
613
|
+
|
|
614
|
+
### Common Issues
|
|
615
|
+
|
|
616
|
+
#### 1. Browser Launch Fails
|
|
617
|
+
```bash
|
|
618
|
+
# Install browser dependencies
|
|
619
|
+
playwright install
|
|
620
|
+
```
|
|
621
|
+
|
|
622
|
+
#### 2. Import Errors
|
|
623
|
+
```bash
|
|
624
|
+
# Ensure correct installation
|
|
625
|
+
pip install unrealon --upgrade
|
|
626
|
+
```
|
|
627
|
+
|
|
628
|
+
#### 3. Proxy Issues
|
|
629
|
+
```python
|
|
630
|
+
# Test proxy connection
|
|
631
|
+
await parser.browser.test_proxy("http://proxy:8080")
|
|
632
|
+
```
|
|
633
|
+
|
|
634
|
+
### Debug Mode
|
|
635
|
+
|
|
636
|
+
```python
|
|
637
|
+
# Enable debug logging
|
|
638
|
+
import logging
|
|
639
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
640
|
+
|
|
641
|
+
# Or use environment variable
|
|
642
|
+
# LOG_LEVEL=DEBUG
|
|
643
|
+
```
|
|
644
|
+
|
|
645
|
+
## Contributing
|
|
646
|
+
|
|
647
|
+
### Development Setup
|
|
648
|
+
|
|
649
|
+
```bash
|
|
650
|
+
# Clone repository
|
|
651
|
+
git clone https://github.com/unrealos/unrealon-rpc.git
|
|
652
|
+
cd unrealon-rpc
|
|
653
|
+
|
|
654
|
+
# Install development dependencies
|
|
655
|
+
pip install -e ".[dev]"
|
|
656
|
+
|
|
657
|
+
# Run tests
|
|
658
|
+
pytest
|
|
659
|
+
|
|
660
|
+
# Run linting
|
|
661
|
+
black src/
|
|
662
|
+
isort src/
|
|
663
|
+
```
|
|
664
|
+
|
|
665
|
+
### Code Style
|
|
666
|
+
|
|
667
|
+
- Follow PEP 8
|
|
668
|
+
- Use type hints
|
|
669
|
+
- Write docstrings
|
|
670
|
+
- Add tests for new features
|
|
671
|
+
|
|
672
|
+
## License
|
|
673
|
+
|
|
674
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
675
|
+
|
|
676
|
+
## Support
|
|
677
|
+
|
|
678
|
+
- **Documentation**: [docs.unrealon.com](https://docs.unrealon.com)
|
|
679
|
+
- **Issues**: [GitHub Issues](https://github.com/unrealos/unrealon-rpc/issues)
|
|
680
|
+
- **Discussions**: [GitHub Discussions](https://github.com/unrealos/unrealon-rpc/discussions)
|
|
681
|
+
|
|
682
|
+
---
|
|
683
|
+
|
|
684
|
+
## Real Projects Built on UnrealOn
|
|
685
|
+
|
|
686
|
+
### 🚗 **CarAPIs** - Automotive Data Platform
|
|
687
|
+
**Platform**: [carapis.com](https://carapis.com)
|
|
688
|
+
**Use Case**: Vehicle information extraction from dealerships and marketplaces
|
|
689
|
+
**Features**: Real-time car listings, pricing analysis, market trends
|
|
690
|
+
**Technology**: AI-powered vehicle data extraction with 95% accuracy
|
|
691
|
+
|
|
692
|
+
### 🛒 **ShopAPIs** - E-commerce Intelligence
|
|
693
|
+
**Platform**: [shopapis.com](https://shopapis.com)
|
|
694
|
+
**Use Case**: Product monitoring and competitive analysis
|
|
695
|
+
**Features**: Price tracking, inventory monitoring, competitor analysis
|
|
696
|
+
**Technology**: Multi-platform e-commerce data collection
|
|
697
|
+
|
|
698
|
+
### 📊 **StockAPIs** - Financial Data Platform
|
|
699
|
+
**Platform**: [stockapis.com](https://stockapis.com)
|
|
700
|
+
**Use Case**: Market data and financial information extraction
|
|
701
|
+
**Features**: Real-time stock data, financial news analysis
|
|
702
|
+
**Technology**: High-frequency financial data collection
|
|
703
|
+
|
|
704
|
+
### 🏠 **PropAPIs** - Real Estate Data Platform
|
|
705
|
+
**Platform**: [propapis.com](https://propapis.com)
|
|
706
|
+
**Use Case**: Property listings and market analysis
|
|
707
|
+
**Features**: Real estate listings, price monitoring, market trends
|
|
708
|
+
**Technology**: Multi-source property data extraction
|
|
709
|
+
|
|
710
|
+
**All platforms built with UnrealOn for reliable, scalable data extraction.**
|
|
711
|
+
|
|
712
|
+
## License
|
|
713
|
+
|
|
714
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
715
|
+
|
|
716
|
+
## Commercial Platform
|
|
717
|
+
|
|
718
|
+
For enterprise features, managed hosting, and professional support, visit [unrealon.com](https://unrealon.com/).
|
|
719
|
+
|
|
720
|
+
---
|
|
721
|
+
|
|
722
|
+
**UnrealOn** - Simple, powerful web scraping for developers.
|