zenx 0.9.5__tar.gz → 0.9.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {zenx-0.9.5 → zenx-0.9.7}/PKG-INFO +2 -1
  2. zenx-0.9.7/README.md +212 -0
  3. {zenx-0.9.5 → zenx-0.9.7}/pyproject.toml +2 -1
  4. {zenx-0.9.5 → zenx-0.9.7}/zenx/clients/http.py +10 -6
  5. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/discord.py +5 -1
  6. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/google_rpc.py +4 -4
  7. {zenx-0.9.5 → zenx-0.9.7}/zenx/settings.py +3 -4
  8. {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/PKG-INFO +2 -1
  9. {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/SOURCES.txt +1 -0
  10. {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/requires.txt +1 -0
  11. {zenx-0.9.5 → zenx-0.9.7}/setup.cfg +0 -0
  12. {zenx-0.9.5 → zenx-0.9.7}/zenx/cli.py +0 -0
  13. {zenx-0.9.5 → zenx-0.9.7}/zenx/clients/__init__.py +0 -0
  14. {zenx-0.9.5 → zenx-0.9.7}/zenx/clients/database.py +0 -0
  15. {zenx-0.9.5 → zenx-0.9.7}/zenx/debug_runner.py +0 -0
  16. {zenx-0.9.5 → zenx-0.9.7}/zenx/discovery.py +0 -0
  17. {zenx-0.9.5 → zenx-0.9.7}/zenx/engine.py +0 -0
  18. {zenx-0.9.5 → zenx-0.9.7}/zenx/exceptions.py +0 -0
  19. {zenx-0.9.5 → zenx-0.9.7}/zenx/logger.py +0 -0
  20. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/__init__.py +0 -0
  21. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/base.py +0 -0
  22. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/manager.py +0 -0
  23. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/preprocess.py +0 -0
  24. {zenx-0.9.5 → zenx-0.9.7}/zenx/pipelines/websocket.py +0 -0
  25. {zenx-0.9.5 → zenx-0.9.7}/zenx/resources/proto/__init__.py +0 -0
  26. {zenx-0.9.5 → zenx-0.9.7}/zenx/resources/proto/feed_pb2.py +0 -0
  27. {zenx-0.9.5 → zenx-0.9.7}/zenx/resources/proto/feed_pb2_grpc.py +0 -0
  28. {zenx-0.9.5 → zenx-0.9.7}/zenx/spiders/__init__.py +0 -0
  29. {zenx-0.9.5 → zenx-0.9.7}/zenx/spiders/base.py +0 -0
  30. {zenx-0.9.5 → zenx-0.9.7}/zenx/utils.py +0 -0
  31. {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/dependency_links.txt +0 -0
  32. {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/entry_points.txt +0 -0
  33. {zenx-0.9.5 → zenx-0.9.7}/zenx.egg-info/top_level.txt +0 -0
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zenx
3
- Version: 0.9.5
3
+ Version: 0.9.7
4
4
  Summary: mini-framework
5
5
  Requires-Python: >=3.12
6
6
  Requires-Dist: curl-cffi>=0.12.0
7
+ Requires-Dist: orjson>=3.11.0
7
8
  Requires-Dist: parsel>=1.10.0
8
9
  Requires-Dist: pebble>=5.1.1
9
10
  Requires-Dist: pydantic>=2.11.7
zenx-0.9.7/README.md ADDED
@@ -0,0 +1,212 @@
1
+
2
+ # ZenX
3
+
4
+ A fast, efficient and minimal web scraping framework built on top of asyncio and uvloop.
5
+
6
+ ## Table of Contents
7
+
8
+ - [High-Level Overview](#high-level-overview)
9
+ - [Installation](#installation)
10
+ - [Quickstart](#quickstart)
11
+ - [1. Create a new project](#1-create-a-new-project)
12
+ - [2. Define a spider](#2-define-a-spider)
13
+ - [3. Run the spider](#3-run-the-spider)
14
+ - [4. List available spiders](#4-list-available-spiders)
15
+ - [Configuration](#configuration)
16
+ - [Built-in Components](#built-in-components)
17
+ - [HTTP Clients](#http-clients)
18
+ - [Databases](#databases)
19
+ - [Pipelines](#pipelines)
20
+
21
+ ## High-Level Overview
22
+
23
+ The framework is composed of the following key components:
24
+
25
+ - **Spiders**: Responsible for fetching web pages and extracting data. Each spider is a class that implements the `zenx.spiders.base.Spider` interface.
26
+
27
+ - **Pipelines**: Process the data extracted by spiders. Each pipeline is a class that implements the `zenx.pipelines.base.Pipeline` interface. Pipelines can be used to clean, validate, store or forward data to third-party services.
28
+
29
+ - **Engine**: Responsible for managing the life-cycle of spiders.
30
+
31
+ - **CLI**: Command-line interface (CLI) for managing spiders and running crawls.
32
+
33
+ ## Installation
34
+
35
+ To install ZenX, you can use pip:
36
+
37
+ ```bash
38
+ pip install zenx
39
+ ```
40
+
41
+ To install ZenX with all optional dependencies for built-in components (Redis, gRPC, WebSockets, Discord), you can use:
42
+
43
+ ```bash
44
+ pip install 'zenx[all]'
45
+ ```
46
+
47
+ Alternatively, you can install specific optional dependencies:
48
+
49
+ ```bash
50
+ pip install 'zenx[redis]' # For Redis database support
51
+ pip install 'zenx[grpc]' # For gRPC pipeline support
52
+ pip install 'zenx[websocket]' # For WebSocket pipeline support
53
+ pip install 'zenx[discord]' # For Discord pipeline support
54
+ ```
55
+
56
+ ## Quickstart
57
+
58
+ To get started with ZenX, you can create a new project and define a spider.
59
+
60
+ ### 1. Create a new project
61
+
62
+ ```bash
63
+ zenx startproject myproject
64
+ ```
65
+
66
+ This will create a new directory called `myproject` with the following structure:
67
+
68
+ ```
69
+ myproject/
70
+ └── spiders/
71
+ └── __init__.py
72
+ ```
73
+
74
+ ### 2. Define a spider
75
+
76
+ Create a new file in the `myproject/spiders` directory (e.g., `myproject/spiders/myspider.py`) and define a spider:
77
+
78
+ ```python
79
+ from zenx.spiders.base import Spider
80
+ from zenx.clients.http import Response
81
+
82
+ class MySpider(Spider):
83
+ name = "myspider"
84
+ pipelines = ["preprocess"] # multiple pipelines can be passed here
85
+
86
+ async def crawl(self) -> None:
87
+ response = await self.client.get("https://example.com")
88
+ await self.process_response(response)
89
+
90
+ async def process_response(self, response: Response) -> None:
91
+ item = self.parse(response)
92
+ # Asynchronously handle the pipelines processing
93
+ self.create_task(self.pm.process_item(item, self.name))
94
+
95
+ def parse(self, response: Response) -> dict:
96
+ return {
97
+ "_id": 1,
98
+ "title": response.xpath("//h1/text()").get(),
99
+ }
100
+ ```
101
+
102
+ ### 3. Run the spider
103
+
104
+ To run the spider, use the `crawl` command:
105
+
106
+ ```bash
107
+ zenx crawl myspider
108
+ ```
109
+
110
+ This will run the `myspider` spider and print the extracted data to the console.
111
+
112
+ #### Running Multiple Spiders
113
+
114
+ You can run multiple spiders at once by passing their names to the `crawl` command:
115
+
116
+ ```bash
117
+ zenx crawl spider1 spider2
118
+ ```
119
+
120
+ To run all available spiders, use the `all` keyword:
121
+
122
+ ```bash
123
+ zenx crawl all
124
+ ```
125
+
126
+ #### Forever Mode
127
+
128
+ ZenX can run spiders continuously in "forever mode". This is useful for long-running spiders that need to monitor websites for changes. To enable forever mode, use the `--forever` flag:
129
+
130
+ ```bash
131
+ zenx crawl myspider --forever
132
+ ```
133
+
134
+ ### 4. List available spiders
135
+
136
+ To see a list of available spiders, use the `list` command:
137
+
138
+ ```bash
139
+ zenx list
140
+ ```
141
+
142
+ ## Configuration
143
+
144
+ ZenX allows for flexible configuration through environment variables or a `.env` file. Below are the key settings you can adjust to customize its behavior:
145
+
146
+ - `APP_ENV`: Specifies the application environment (e.g., `dev`, `prod`).
147
+ - `SESSION_POOL_SIZE`: Defines the number of sessions in the session pool.
148
+ - `MAX_SCRAPE_DELAY`: Sets the maximum allowed delay (in seconds) between an item's publication and its scraping.
149
+ - `DQ_MAX_SIZE`: Configures the maximum size of the deque used by the in-memory database.
150
+ - `REDIS_RECORD_EXPIRY_SECONDS`: Determines how long (in seconds) records are stored in Redis before expiring.
151
+ - `DB_TYPE`: Selects the database backend to use (`memory` for in-memory, `redis` for Redis).
152
+ - `DB_NAME`: The name of the database to connect to (if applicable).
153
+ - `DB_USER`: The username for database authentication (if applicable).
154
+ - `DB_PASS`: The password for database authentication (if applicable).
155
+ - `DB_HOST`: The hostname or IP address of the database server.
156
+ - `DB_PORT`: The port number for the database server.
157
+ - `PROXY_v4`: Specifies an IPv4 proxy to be used for outgoing requests.
158
+ - `PROXY_V6`: Specifies an IPv6 proxy to be used for outgoing requests.
159
+ - `SYNOPTIC_GRPC_SERVER_URI`: The URI for the gRPC server endpoint.
160
+ - `SYNOPTIC_GRPC_TOKEN`: The authentication token for gRPC communication.
161
+ - `SYNOPTIC_GRPC_ID`: A unique identifier for gRPC messages.
162
+ - `SYNOPTIC_API_KEY`: The API key required for accessing the Synoptic API via websockets.
163
+ - `SYNOPTIC_STREAM_ID`: The stream ID for publishing data to the Synoptic API via websockets.
164
+ - `SYNOPTIC_DISCORD_WEBHOOK`: The webhook URL for sending messages to a Discord channel.
165
+
166
+ ## Built-in Components
167
+
168
+ ZenX provides several pre-built components to streamline common web scraping tasks. These include HTTP clients for making requests, databases for data storage, and pipelines for processing scraped items.
169
+
170
+ ### HTTP Clients
171
+
172
+ ZenX offers the following HTTP client out-of-the-box:
173
+
174
+ - **`curl_cffi`**: A client that leverages `curl-cffi` for making HTTP requests. This client is capable of impersonating various browsers, which can be useful for avoiding detection or blocks.
175
+
176
+ #### Session Usage
177
+
178
+ The client supports session management to maintain state (like cookies) across multiple requests and improve performance by reusing connections.
179
+
180
+ - **Enabling Sessions**: To use sessions for a request within your spider, set the `use_sessions=True` parameter when making a request through `self.client`:
181
+
182
+ ```python
183
+ response = await self.client.get("https://example.com/page1", use_sessions=True)
184
+ ```
185
+
186
+ - **Session Pool**: The `curl_cffi` client maintains a pool of sessions. The size of this pool is controlled by the `SESSION_POOL_SIZE` environment variable. By default, each session in the pool will have a randomly assigned browser fingerprint for impersonation.
187
+
188
+ - **Important Note**: If `use_sessions` is `False` (the default), each request will be made with a new, independent session and a randomly chosen browser fingerprint. This is suitable for single, isolated requests.
189
+
190
+ ### Databases
191
+
192
+ ZenX supports the following database backends:
193
+
194
+ - **`memory`**: An in-memory database that utilizes a deque for temporary data storage. This is ideal for development, testing, and scenarios where persistence is not required.
195
+ - **`redis`**: A persistent database solution using Redis. This is suitable for production environments.
196
+ - **Dependencies**: `redis` (install with `pip install 'zenx[redis]'`)
197
+ - **Required Settings**: `DB_HOST`, `DB_PORT`, `DB_PASS`
198
+
199
+ ### Pipelines
200
+
201
+ Pipelines process items after they are scraped by spiders. ZenX includes the following built-in pipelines:
202
+
203
+ - **`preprocess`**: This pipeline pre-processes items before they proceed to other pipelines. It handles deduplication: if an item contains an `_id` field, the `preprocess` pipeline uses this field to check if the item has already been processed. If a duplicate `_id` is found, the item is dropped.
204
+ - **`synoptic_websocket`**: A pipeline designed to send processed items to the Synoptic API via a WebSocket connection. It requires the item to contain a `_content` field, which should hold the pre-formatted message intended for the Synoptic server.
205
+ - **Dependencies**: `websockets` (install with `pip install 'zenx[websocket]'`)
206
+ - **Required Settings**: `SYNOPTIC_API_KEY`, `SYNOPTIC_STREAM_ID`
207
+ - **`synoptic_grpc`**: This pipeline sends items to the Synoptic API using a gRPC connection.
208
+ - **Dependencies**: `grpcio` (install with `pip install 'zenx[grpc]'`)
209
+ - **Required Settings**: `SYNOPTIC_GRPC_SERVER_URI`, `SYNOPTIC_GRPC_TOKEN`, `SYNOPTIC_GRPC_ID`
210
+ - **`synoptic_discord`**: A pipeline for sending items to a Discord webhook. This is useful for notifications or logging scraped data to a Discord channel.
211
+ - **Dependencies**: `httpx` (install with `pip install 'zenx[discord]'`)
212
+ - **Required Settings**: `SYNOPTIC_DISCORD_WEBHOOK`
@@ -1,10 +1,11 @@
1
1
  [project]
2
2
  name = "zenx"
3
- version = "0.9.5"
3
+ version = "0.9.7"
4
4
  description = "mini-framework"
5
5
  requires-python = ">=3.12"
6
6
  dependencies = [
7
7
  "curl-cffi>=0.12.0",
8
+ "orjson>=3.11.0",
8
9
  "parsel>=1.10.0",
9
10
  "pebble>=5.1.1",
10
11
  "pydantic>=2.11.7",
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
- import parsel
2
+ from parsel import Selector, SelectorList
3
+ from functools import cached_property
3
4
  import random
4
5
  from curl_cffi.requests.impersonate import BrowserTypeLiteral
5
6
  from curl_cffi import AsyncSession, Response as CurlResponse
@@ -7,7 +8,7 @@ from abc import ABC, abstractmethod
7
8
  import asyncio
8
9
  from dataclasses import dataclass
9
10
  from typing import Any, ClassVar, Tuple, Type, Dict, get_args
10
- import json
11
+ import orjson
11
12
  from structlog import BoundLogger
12
13
 
13
14
  from zenx.settings import Settings
@@ -25,11 +26,14 @@ class Response:
25
26
  latency_ms: int
26
27
 
27
28
  def json(self) -> Any:
28
- return json.loads(self.text)
29
+ return orjson.loads(self.text)
29
30
 
30
- def selector(self) -> parsel.Selector:
31
- sel = parsel.Selector(self.text)
32
- return sel
31
+ @cached_property
32
+ def selector(self) -> Selector:
33
+ return Selector(self.text)
34
+
35
+ def xpath(self, query: str) -> SelectorList[Selector]:
36
+ return self.selector.xpath(query)
33
37
 
34
38
 
35
39
  class HttpClient(ABC):
@@ -1,4 +1,5 @@
1
1
  from typing import Dict
2
+ import json
2
3
  from structlog import BoundLogger
3
4
 
4
5
  from zenx.pipelines.base import Pipeline
@@ -36,7 +37,9 @@ try:
36
37
  async def _process(self, item: Dict) -> None:
37
38
  try:
38
39
  _item = {k: v for k, v in item.items() if not k.startswith("_")}
39
- await self._client.post(self._uri, json=_item)
40
+ message_content = f"```json\n{json.dumps(_item, indent=4)}\n```"
41
+ payload = {"content": message_content}
42
+ await self._client.post(self._uri, json=payload)
40
43
  except Exception as e:
41
44
  self.logger.error("processing", exception=str(e), id=item.get("_id"), pipeline=self.name)
42
45
 
@@ -44,6 +47,7 @@ try:
44
47
  async def close(self) -> None:
45
48
  if hasattr(self, "_client") and self._client:
46
49
  await self._client.aclose()
50
+
47
51
  except ModuleNotFoundError:
48
52
  # proxy pattern
49
53
  class SynopticDiscordPipeline(Pipeline):
@@ -15,14 +15,14 @@ try:
15
15
 
16
16
  class SynopticGoogleRPCPipeline(Pipeline): # type: ignore[reportRedeclaration]
17
17
  name = "synoptic_grpc"
18
- required_settings = ["GRPC_SERVER_URI", "GRPC_TOKEN", "GRPC_ID"]
18
+ required_settings = ["SYNOPTIC_GRPC_SERVER_URI", "SYNOPTIC_GRPC_TOKEN", "SYNOPTIC_GRPC_ID"]
19
19
 
20
20
 
21
21
  def __init__(self, logger: structlog.BoundLogger, db: DBClient, settings: Settings) -> None:
22
22
  super().__init__(logger, db, settings)
23
- self._uri = self.settings.GRPC_SERVER_URI
24
- self._feed_token = self.settings.GRPC_TOKEN
25
- self._feed_id = self.settings.GRPC_ID
23
+ self._uri = self.settings.SYNOPTIC_GRPC_SERVER_URI
24
+ self._feed_token = self.settings.SYNOPTIC_GRPC_TOKEN
25
+ self._feed_id = self.settings.SYNOPTIC_GRPC_ID
26
26
  self._feed_pb2 = importlib.import_module("zenx.resources.proto.feed_pb2")
27
27
  self._feed_pb2_grpc = importlib.import_module("zenx.resources.proto.feed_pb2_grpc")
28
28
 
@@ -21,10 +21,9 @@ class Settings(BaseSettings):
21
21
  PROXY_v4: str | None = None
22
22
  PROXY_V6: str | None = None
23
23
 
24
- GRPC_SERVER_URI: str = "ingress.opticfeeds.com"
25
- GRPC_TOKEN: str | None = None
26
- GRPC_ID: str | None = None
27
- GRPC_ID_HEADLINE: str | None = None
24
+ SYNOPTIC_GRPC_SERVER_URI: str = "ingress.opticfeeds.com"
25
+ SYNOPTIC_GRPC_TOKEN: str | None = None
26
+ SYNOPTIC_GRPC_ID: str | None = None
28
27
 
29
28
  SYNOPTIC_DISCORD_WEBHOOK: str | None = None
30
29
 
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: zenx
3
- Version: 0.9.5
3
+ Version: 0.9.7
4
4
  Summary: mini-framework
5
5
  Requires-Python: >=3.12
6
6
  Requires-Dist: curl-cffi>=0.12.0
7
+ Requires-Dist: orjson>=3.11.0
7
8
  Requires-Dist: parsel>=1.10.0
8
9
  Requires-Dist: pebble>=5.1.1
9
10
  Requires-Dist: pydantic>=2.11.7
@@ -1,3 +1,4 @@
1
+ README.md
1
2
  pyproject.toml
2
3
  zenx/cli.py
3
4
  zenx/debug_runner.py
@@ -1,4 +1,5 @@
1
1
  curl-cffi>=0.12.0
2
+ orjson>=3.11.0
2
3
  parsel>=1.10.0
3
4
  pebble>=5.1.1
4
5
  pydantic>=2.11.7
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes