zenx 0.9.5__tar.gz → 0.9.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {zenx-0.9.5 → zenx-0.9.7}/PKG-INFO +2 -1
- zenx-0.9.7/README.md +212 -0
- {zenx-0.9.5 → zenx-0.9.7}/pyproject.toml +2 -1
- {zenx-0.9.5 → zenx-0.9.7}/zenx/clients/http.py +10 -6
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/discord.py +5 -1
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/google_rpc.py +4 -4
- {zenx-0.9.5 → zenx-0.9.7}/zenx/settings.py +3 -4
- {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/PKG-INFO +2 -1
- {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/SOURCES.txt +1 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/requires.txt +1 -0
- {zenx-0.9.5 → zenx-0.9.7}/setup.cfg +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/cli.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/clients/__init__.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/clients/database.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/debug_runner.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/discovery.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/engine.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/exceptions.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/logger.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/__init__.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/base.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/manager.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/preprocess.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/websocket.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/resources/proto/__init__.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/resources/proto/feed_pb2.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/resources/proto/feed_pb2_grpc.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/spiders/__init__.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/spiders/base.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx/utils.py +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/dependency_links.txt +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/entry_points.txt +0 -0
- {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/top_level.txt +0 -0
@@ -1,9 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: zenx
|
3
|
-
Version: 0.9.
|
3
|
+
Version: 0.9.7
|
4
4
|
Summary: mini-framework
|
5
5
|
Requires-Python: >=3.12
|
6
6
|
Requires-Dist: curl-cffi>=0.12.0
|
7
|
+
Requires-Dist: orjson>=3.11.0
|
7
8
|
Requires-Dist: parsel>=1.10.0
|
8
9
|
Requires-Dist: pebble>=5.1.1
|
9
10
|
Requires-Dist: pydantic>=2.11.7
|
zenx-0.9.7/README.md
ADDED
@@ -0,0 +1,212 @@
|
|
1
|
+
|
2
|
+
# ZenX
|
3
|
+
|
4
|
+
A fast, efficient and minimal web scraping framework built on top of asyncio and uvloop.
|
5
|
+
|
6
|
+
## Table of Contents
|
7
|
+
|
8
|
+
- [High-Level Overview](#high-level-overview)
|
9
|
+
- [Installation](#installation)
|
10
|
+
- [Quickstart](#quickstart)
|
11
|
+
- [1. Create a new project](#1-create-a-new-project)
|
12
|
+
- [2. Define a spider](#2-define-a-spider)
|
13
|
+
- [3. Run the spider](#3-run-the-spider)
|
14
|
+
- [4. List available spiders](#4-list-available-spiders)
|
15
|
+
- [Configuration](#configuration)
|
16
|
+
- [Built-in Components](#built-in-components)
|
17
|
+
- [HTTP Clients](#http-clients)
|
18
|
+
- [Databases](#databases)
|
19
|
+
- [Pipelines](#pipelines)
|
20
|
+
|
21
|
+
## High-Level Overview
|
22
|
+
|
23
|
+
The framework is composed of the following key components:
|
24
|
+
|
25
|
+
- **Spiders**: Responsible for fetching web pages and extracting data. Each spider is a class that implements the `zenx.spiders.base.Spider` interface.
|
26
|
+
|
27
|
+
- **Pipelines**: Process the data extracted by spiders. Each pipeline is a class that implements the `zenx.pipelines.base.Pipeline` interface. Pipelines can be used to clean, validate, store or forward data to third party services.
|
28
|
+
|
29
|
+
- **Engine**: Responsible for managing the life-cycle of spiders.
|
30
|
+
|
31
|
+
- **CLI**: Command-line interface (CLI) for managing spiders and running crawls.
|
32
|
+
|
33
|
+
## Installation
|
34
|
+
|
35
|
+
To install ZenX, you can use pip:
|
36
|
+
|
37
|
+
```bash
|
38
|
+
pip install zenx
|
39
|
+
```
|
40
|
+
|
41
|
+
To install ZenX with all optional dependencies for built-in components (Redis, gRPC, WebSockets, Discord), you can use:
|
42
|
+
|
43
|
+
```bash
|
44
|
+
pip install 'zenx[all]'
|
45
|
+
```
|
46
|
+
|
47
|
+
Alternatively, you can install specific optional dependencies:
|
48
|
+
|
49
|
+
```bash
|
50
|
+
pip install 'zenx[redis]' # For Redis database support
|
51
|
+
pip install 'zenx[grpc]' # For gRPC pipeline support
|
52
|
+
pip install 'zenx[websocket]' # For WebSocket pipeline support
|
53
|
+
pip install 'zenx[discord]' # For Discord pipeline support
|
54
|
+
```
|
55
|
+
|
56
|
+
## Quickstart
|
57
|
+
|
58
|
+
To get started with ZenX, you can create a new project and define a spider.
|
59
|
+
|
60
|
+
### 1. Create a new project
|
61
|
+
|
62
|
+
```bash
|
63
|
+
zenx startproject myproject
|
64
|
+
```
|
65
|
+
|
66
|
+
This will create a new directory called `myproject` with the following structure:
|
67
|
+
|
68
|
+
```
|
69
|
+
myproject/
|
70
|
+
└── spiders/
|
71
|
+
└── __init__.py
|
72
|
+
```
|
73
|
+
|
74
|
+
### 2. Define a spider
|
75
|
+
|
76
|
+
Create a new file in the `myproject/spiders` directory (e.g., `myproject/spiders/myspider.py`) and define a spider:
|
77
|
+
|
78
|
+
```python
|
79
|
+
from zenx.spiders.base import Spider
|
80
|
+
from zenx.clients.http import Response
|
81
|
+
|
82
|
+
class MySpider(Spider):
|
83
|
+
name = "myspider"
|
84
|
+
pipelines = ["preprocess"] # multiple pipelines can be passed here
|
85
|
+
|
86
|
+
async def crawl(self) -> None:
|
87
|
+
response = await self.client.get("https://example.com")
|
88
|
+
await self.process_response(response)
|
89
|
+
|
90
|
+
async def process_response(self, response: Response) -> None:
|
91
|
+
item = self.parse(response)
|
92
|
+
# Asynchronously handle the pipelines processing
|
93
|
+
self.create_task(self.pm.process_item(item, self.name))
|
94
|
+
|
95
|
+
def parse(self, response: Response) -> Dict:
|
96
|
+
return {
|
97
|
+
"_id": 1,
|
98
|
+
"title": response.xpath("//h1/text()").get(),
|
99
|
+
}
|
100
|
+
```
|
101
|
+
|
102
|
+
### 3. Run the spider
|
103
|
+
|
104
|
+
To run the spider, use the `crawl` command:
|
105
|
+
|
106
|
+
```bash
|
107
|
+
zenx crawl myspider
|
108
|
+
```
|
109
|
+
|
110
|
+
This will run the `myspider` spider and print the extracted data to the console.
|
111
|
+
|
112
|
+
#### Running Multiple Spiders
|
113
|
+
|
114
|
+
You can run multiple spiders at once by passing their names to the `crawl` command:
|
115
|
+
|
116
|
+
```bash
|
117
|
+
zenx crawl spider1 spider2
|
118
|
+
```
|
119
|
+
|
120
|
+
To run all available spiders, use the `all` keyword:
|
121
|
+
|
122
|
+
```bash
|
123
|
+
zenx crawl all
|
124
|
+
```
|
125
|
+
|
126
|
+
#### Forever Mode
|
127
|
+
|
128
|
+
ZenX can run spiders continuously in "forever mode". This is useful for long-running spiders that need to monitor websites for changes. To enable forever mode, use the `--forever` flag:
|
129
|
+
|
130
|
+
```bash
|
131
|
+
zenx crawl myspider --forever
|
132
|
+
```
|
133
|
+
|
134
|
+
### 4. List available spiders
|
135
|
+
|
136
|
+
To see a list of available spiders, use the `list` command:
|
137
|
+
|
138
|
+
```bash
|
139
|
+
zenx list
|
140
|
+
```
|
141
|
+
|
142
|
+
## Configuration
|
143
|
+
|
144
|
+
ZenX allows for flexible configuration through environment variables or a `.env` file. Below are the key settings you can adjust to customize its behavior:
|
145
|
+
|
146
|
+
- `APP_ENV`: Specifies the application environment (e.g., `dev`, `prod`).
|
147
|
+
- `SESSION_POOL_SIZE`: Defines the number of sessions in the session pool.
|
148
|
+
- `MAX_SCRAPE_DELAY`: Sets the maximum allowed delay (in seconds) between an item's publication and its scraping.
|
149
|
+
- `DQ_MAX_SIZE`: Configures the maximum size of the deque used by the in-memory database.
|
150
|
+
- `REDIS_RECORD_EXPIRY_SECONDS`: Determines how long (in seconds) records are stored in Redis before expiring.
|
151
|
+
- `DB_TYPE`: Selects the database backend to use (`memory` for in-memory, `redis` for Redis).
|
152
|
+
- `DB_NAME`: The name of the database to connect to (if applicable).
|
153
|
+
- `DB_USER`: The username for database authentication (if applicable).
|
154
|
+
- `DB_PASS`: The password for database authentication (if applicable).
|
155
|
+
- `DB_HOST`: The hostname or IP address of the database server.
|
156
|
+
- `DB_PORT`: The port number for the database server.
|
157
|
+
- `PROXY_v4`: Specifies an IPv4 proxy to be used for outgoing requests.
|
158
|
+
- `PROXY_V6`: Specifies an IPv6 proxy to be used for outgoing requests.
|
159
|
+
- `SYNOPTIC_GRPC_SERVER_URI`: The URI for the gRPC server endpoint.
|
160
|
+
- `SYNOPTIC_GRPC_TOKEN`: The authentication token for gRPC communication.
|
161
|
+
- `SYNOPTIC_GRPC_ID`: A unique identifier for gRPC messages.
|
162
|
+
- `SYNOPTIC_API_KEY`: The API key required for accessing the Synoptic API via websockets.
|
163
|
+
- `SYNOPTIC_STREAM_ID`: The stream ID for publishing data to the Synoptic API via websockets.
|
164
|
+
- `SYNOPTIC_DISCORD_WEBHOOK`: The webhook URL for sending messages to a Discord channel.
|
165
|
+
|
166
|
+
## Built-in Components
|
167
|
+
|
168
|
+
ZenX provides several pre-built components to streamline common web scraping tasks. These include HTTP clients for making requests, databases for data storage, and pipelines for processing scraped items.
|
169
|
+
|
170
|
+
### HTTP Clients
|
171
|
+
|
172
|
+
ZenX offers the following HTTP client out-of-the-box:
|
173
|
+
|
174
|
+
- **`curl_cffi`**: A client that leverages `curl-cffi` for making HTTP requests. This client is capable of impersonating various browsers, which can be useful for avoiding detection or blocks.
|
175
|
+
|
176
|
+
#### Session Usage
|
177
|
+
|
178
|
+
The client supports session management to maintain state (like cookies) across multiple requests and improve performance by reusing connections.
|
179
|
+
|
180
|
+
- **Enabling Sessions**: To use sessions for a request within your spider, set the `use_sessions=True` parameter when making a request through `self.client`:
|
181
|
+
|
182
|
+
```python
|
183
|
+
response = await self.client.get("https://example.com/page1", use_sessions=True)
|
184
|
+
```
|
185
|
+
|
186
|
+
- **Session Pool**: The `curl_cffi` client maintains a pool of sessions. The size of this pool is controlled by the `SESSION_POOL_SIZE` environment variable. By default, each session in the pool will have a randomly assigned browser fingerprint for impersonation.
|
187
|
+
|
188
|
+
- **Important Note**: If `use_sessions` is `False` (the default), each request will be made with a new, independent session and a randomly chosen browser fingerprint. This is suitable for single, isolated requests.
|
189
|
+
|
190
|
+
### Databases
|
191
|
+
|
192
|
+
ZenX supports the following database backends:
|
193
|
+
|
194
|
+
- **`memory`**: An in-memory database that utilizes a deque for temporary data storage. This is ideal for development, testing, and scenarios where persistence is not required.
|
195
|
+
- **`redis`**: A persistent database solution using Redis. This is suitable for production environments.
|
196
|
+
- **Dependencies**: `redis` (install with `pip install 'zenx[redis]'`)
|
197
|
+
- **Required Settings**: `DB_HOST`, `DB_PORT`, `DB_PASS`
|
198
|
+
|
199
|
+
### Pipelines
|
200
|
+
|
201
|
+
Pipelines process items after they are scraped by spiders. ZenX includes the following built-in pipelines:
|
202
|
+
|
203
|
+
- **`preprocess`**: This pipeline pre-processes items before they proceed to other pipelines. It handles deduplication: if an item contains an `_id` field, the `preprocess` pipeline uses this field to check if the item has already been processed. If a duplicate `_id` is found, the item is dropped.
|
204
|
+
- **`synoptic_websocket`**: A pipeline designed to send processed items to the Synoptic API via a WebSocket connection. It requires the item to contain a `_content` field, which should hold the pre-formatted message intended for the Synoptic server.
|
205
|
+
- **Dependencies**: `websockets` (install with `pip install 'zenx[websocket]'`)
|
206
|
+
- **Required Settings**: `SYNOPTIC_API_KEY`, `SYNOPTIC_STREAM_ID`
|
207
|
+
- **`synoptic_grpc`**: This pipeline sends items to the Synoptic API using a gRPC connection.
|
208
|
+
- **Dependencies**: `grpcio` (install with `pip install 'zenx[grpc]'`)
|
209
|
+
- **Required Settings**: `SYNOPTIC_GRPC_SERVER_URI`, `SYNOPTIC_GRPC_TOKEN`, `SYNOPTIC_GRPC_ID`
|
210
|
+
- **`synoptic_discord`**: A pipeline for sending items to a Discord webhook. This is useful for notifications or logging scraped data to a Discord channel.
|
211
|
+
- **Dependencies**: `httpx` (install with `pip install 'zenx[discord]'`)
|
212
|
+
- **Required Settings**: `SYNOPTIC_DISCORD_WEBHOOK`
|
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
import
|
2
|
+
from parsel import Selector, SelectorList
|
3
|
+
from functools import cached_property
|
3
4
|
import random
|
4
5
|
from curl_cffi.requests.impersonate import BrowserTypeLiteral
|
5
6
|
from curl_cffi import AsyncSession, Response as CurlResponse
|
@@ -7,7 +8,7 @@ from abc import ABC, abstractmethod
|
|
7
8
|
import asyncio
|
8
9
|
from dataclasses import dataclass
|
9
10
|
from typing import Any, ClassVar, Tuple, Type, Dict, get_args
|
10
|
-
import
|
11
|
+
import orjson
|
11
12
|
from structlog import BoundLogger
|
12
13
|
|
13
14
|
from zenx.settings import Settings
|
@@ -25,11 +26,14 @@ class Response:
|
|
25
26
|
latency_ms: int
|
26
27
|
|
27
28
|
def json(self) -> Any:
|
28
|
-
return
|
29
|
+
return orjson.loads(self.text)
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
return
|
31
|
+
@cached_property
|
32
|
+
def selector(self) -> Selector:
|
33
|
+
return Selector(self.text)
|
34
|
+
|
35
|
+
def xpath(self, query: str) -> SelectorList[Selector]:
|
36
|
+
return self.selector.xpath(query)
|
33
37
|
|
34
38
|
|
35
39
|
class HttpClient(ABC):
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from typing import Dict
|
2
|
+
import json
|
2
3
|
from structlog import BoundLogger
|
3
4
|
|
4
5
|
from zenx.pipelines.base import Pipeline
|
@@ -36,7 +37,9 @@ try:
|
|
36
37
|
async def _process(self, item: Dict) -> None:
|
37
38
|
try:
|
38
39
|
_item = {k: v for k, v in item.items() if not k.startswith("_")}
|
39
|
-
|
40
|
+
message_content = f"```json\n{json.dumps(_item, indent=4)}\n```"
|
41
|
+
payload = {"content": message_content}
|
42
|
+
await self._client.post(self._uri, json=payload)
|
40
43
|
except Exception as e:
|
41
44
|
self.logger.error("processing", exception=str(e), id=item.get("_id"), pipeline=self.name)
|
42
45
|
|
@@ -44,6 +47,7 @@ try:
|
|
44
47
|
async def close(self) -> None:
|
45
48
|
if hasattr(self, "_client") and self._client:
|
46
49
|
await self._client.aclose()
|
50
|
+
|
47
51
|
except ModuleNotFoundError:
|
48
52
|
# proxy pattern
|
49
53
|
class SynopticDiscordPipeline(Pipeline):
|
@@ -15,14 +15,14 @@ try:
|
|
15
15
|
|
16
16
|
class SynopticGoogleRPCPipeline(Pipeline): # type: ignore[reportRedeclaration]
|
17
17
|
name = "synoptic_grpc"
|
18
|
-
required_settings = ["
|
18
|
+
required_settings = ["SYNOPTIC_GRPC_SERVER_URI", "SYNOPTIC_GRPC_TOKEN", "SYNOPTIC_GRPC_ID"]
|
19
19
|
|
20
20
|
|
21
21
|
def __init__(self, logger: structlog.BoundLogger, db: DBClient, settings: Settings) -> None:
|
22
22
|
super().__init__(logger, db, settings)
|
23
|
-
self._uri = self.settings.
|
24
|
-
self._feed_token = self.settings.
|
25
|
-
self._feed_id = self.settings.
|
23
|
+
self._uri = self.settings.SYNOPTIC_GRPC_SERVER_URI
|
24
|
+
self._feed_token = self.settings.SYNOPTIC_GRPC_TOKEN
|
25
|
+
self._feed_id = self.settings.SYNOPTIC_GRPC_ID
|
26
26
|
self._feed_pb2 = importlib.import_module("zenx.resources.proto.feed_pb2")
|
27
27
|
self._feed_pb2_grpc = importlib.import_module("zenx.resources.proto.feed_pb2_grpc")
|
28
28
|
|
@@ -21,10 +21,9 @@ class Settings(BaseSettings):
|
|
21
21
|
PROXY_v4: str | None = None
|
22
22
|
PROXY_V6: str | None = None
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
GRPC_ID_HEADLINE: str | None = None
|
24
|
+
SYNOPTIC_GRPC_SERVER_URI: str = "ingress.opticfeeds.com"
|
25
|
+
SYNOPTIC_GRPC_TOKEN: str | None = None
|
26
|
+
SYNOPTIC_GRPC_ID: str | None = None
|
28
27
|
|
29
28
|
SYNOPTIC_DISCORD_WEBHOOK: str | None = None
|
30
29
|
|
@@ -1,9 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: zenx
|
3
|
-
Version: 0.9.
|
3
|
+
Version: 0.9.7
|
4
4
|
Summary: mini-framework
|
5
5
|
Requires-Python: >=3.12
|
6
6
|
Requires-Dist: curl-cffi>=0.12.0
|
7
|
+
Requires-Dist: orjson>=3.11.0
|
7
8
|
Requires-Dist: parsel>=1.10.0
|
8
9
|
Requires-Dist: pebble>=5.1.1
|
9
10
|
Requires-Dist: pydantic>=2.11.7
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|