webcrawlerapi 2.0.3.tar.gz → 2.0.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/setup.py +1 -1
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi/client.py +8 -7
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi/models.py +6 -34
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/README.md +0 -0
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/setup.cfg +0 -0
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi/__init__.py +0 -0
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.3 → webcrawlerapi-2.0.4}/webcrawlerapi.egg-info/top_level.txt +0 -0
webcrawlerapi/client.py

@@ -12,13 +12,15 @@ from .models import (
     Action,
 )
 
+CRAWLER_VERSION = "v1"
+SCRAPER_VERSION = "v2"
 
 class WebCrawlerAPI:
     """Python SDK for WebCrawler API."""
 
     DEFAULT_POLL_DELAY_SECONDS = 5
 
-    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com"
+    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com"):
         """
         Initialize the WebCrawler API client.
 
@@ -29,7 +31,6 @@ class WebCrawlerAPI:
         """
         self.api_key = api_key
         self.base_url = base_url.rstrip('/')
-        self.version = version
         self.session = requests.Session()
         self.session.headers.update({
             'Authorization': f'Bearer {api_key}',
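With the per-instance `version` attribute gone, the API version is now fixed per endpoint group by the module-level CRAWLER_VERSION and SCRAPER_VERSION constants. A hedged sketch of what client construction looks like after this change (the key value is a placeholder and the top-level import path is assumed from the package layout, not shown in this diff):

# Hedged sketch, not part of the diff itself.
from webcrawlerapi import WebCrawlerAPI

# No `version` argument any more: crawl/job endpoints are pinned to v1 and
# scrape endpoints to v2 by the module constants added above.
client = WebCrawlerAPI(api_key="YOUR_API_KEY")

# A trailing slash on base_url is harmless: the constructor strips it with rstrip('/').
client = WebCrawlerAPI(api_key="YOUR_API_KEY",
                       base_url="https://api.webcrawlerapi.com/")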
@@ -86,7 +87,7 @@ class WebCrawlerAPI:
         payload["actions"] = [vars(action) for action in action_list]
 
         response = self.session.post(
-            urljoin(self.base_url, f"/{
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/crawl"),
             json=payload
         )
         response.raise_for_status()
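Every rewritten request URL in this file follows the same pattern: a leading-slash path built from one of the version constants, joined onto base_url with urllib.parse.urljoin. A standard-library-only illustration of what that resolves to (values below are illustrative, not taken from the SDK):

# Standalone illustration of the URL building used in these hunks.
from urllib.parse import urljoin

CRAWLER_VERSION = "v1"
base_url = "https://api.webcrawlerapi.com"

# Because the second argument starts with "/", urljoin replaces any existing
# path on base_url instead of appending to it.
print(urljoin(base_url, f"/{CRAWLER_VERSION}/crawl"))
# -> https://api.webcrawlerapi.com/v1/crawl
print(urljoin(base_url + "/some/old/path", f"/{CRAWLER_VERSION}/crawl"))
# -> https://api.webcrawlerapi.com/v1/crawl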
@@ -106,7 +107,7 @@ class WebCrawlerAPI:
             requests.exceptions.RequestException: If the API request fails
         """
         response = self.session.get(
-            urljoin(self.base_url, f"/{
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}")
         )
         response.raise_for_status()
         return Job(response.json())
@@ -126,7 +127,7 @@ class WebCrawlerAPI:
             requests.exceptions.RequestException: If the API request fails
         """
         response = self.session.put(
-            urljoin(self.base_url, f"/{
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
         )
         response.raise_for_status()
         return response.json()
@@ -246,7 +247,7 @@ class WebCrawlerAPI:
         payload["actions"] = [vars(action) for action in action_list]
 
         response = self.session.post(
-            urljoin(self.base_url, f"/{
+            urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape?async=true"),
             json=payload
         )
 
@@ -279,7 +280,7 @@ class WebCrawlerAPI:
             requests.exceptions.RequestException: If the API request fails
         """
         response = self.session.get(
-            urljoin(self.base_url, f"/{
+            urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape/{scrape_id}")
         )
 
         response.raise_for_status()
webcrawlerapi/models.py

@@ -1,34 +1,6 @@
 from typing import Optional, Dict, Any, List
 from datetime import datetime
 from dataclasses import dataclass
-import re
-
-
-def parse_datetime(datetime_str: str) -> datetime:
-    """
-    Parse datetime string from API response, handling various microsecond formats.
-
-    Args:
-        datetime_str (str): Datetime string from API
-
-    Returns:
-        datetime: Parsed datetime object
-    """
-    # Replace 'Z' with '+00:00' for timezone
-    datetime_str = datetime_str.replace('Z', '+00:00')
-
-    # Handle microseconds - pad to 6 digits or remove if present
-    # Pattern matches: YYYY-MM-DDTHH:MM:SS.microseconds followed by timezone or end
-    pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
-    match = re.match(pattern, datetime_str)
-
-    if match:
-        base_time, microseconds, timezone_part = match.groups()
-        # Pad microseconds to 6 digits or truncate if longer
-        microseconds = microseconds.ljust(6, '0')[:6]
-        datetime_str = f"{base_time}.{microseconds}{timezone_part}"
-
-    return datetime.fromisoformat(datetime_str)
 
 
 @dataclass
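The removed parse_datetime helper did two things: it swapped the trailing "Z" for an explicit "+00:00" offset and normalized fractional seconds to six digits before calling datetime.fromisoformat. The model classes now inline only the first step. A standard-library sketch of that inlined parsing (the timestamp value is invented, not taken from the API):

# Illustration of the parsing now inlined in the models.
from datetime import datetime

raw = "2024-01-15T10:30:00.123456Z"
parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
print(parsed)  # 2024-01-15 10:30:00.123456+00:00

Note that before Python 3.11, fromisoformat() only accepts three- or six-digit fractional seconds, so timestamps with other digit counts (which the removed regex-based helper padded or truncated) would raise ValueError on older interpreters.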
@@ -106,8 +78,8 @@ class JobItem:
         self.page_status_code: int = data["page_status_code"]
         self.status: str = data["status"]
         self.title: str = data["title"]
-        self.created_at: datetime =
-        self.updated_at: datetime =
+        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
+        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
         self.cost: int = data.get("cost", 0)
         self.referred_url: Optional[str] = data.get("referred_url")
         self.last_error: Optional[str] = data.get("last_error")
@@ -185,17 +157,17 @@ class Job:
         self.scrape_type: str = data["scrape_type"]
         self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
         self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
-        self.allow_subdomains: bool = data
+        self.allow_subdomains: bool = data["allow_subdomains"]
         self.items_limit: int = data["items_limit"]
-        self.created_at: datetime =
-        self.updated_at: datetime =
+        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
+        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
         self.webhook_url: Optional[str] = data.get("webhook_url")
         self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
 
         # Optional fields
         self.finished_at: Optional[datetime] = None
         if data.get("finished_at"):
-            self.finished_at =
+            self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
 
         self.webhook_status: Optional[str] = data.get("webhook_status")
         self.webhook_error: Optional[str] = data.get("webhook_error")
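Because created_at, updated_at, and finished_at are now parsed with an explicit +00:00 offset, they are timezone-aware datetimes. A hedged consumer-side sketch using only the Job fields visible in this hunk (the describe helper is hypothetical, and the Job instance is assumed to come from the client rather than being constructed here):

# Hypothetical helper, not part of the SDK.
from datetime import datetime, timezone

def describe(job) -> str:
    """Summarize a Job using only fields visible in this hunk."""
    # Aware datetimes must be compared against an aware "now".
    age = datetime.now(timezone.utc) - job.created_at
    finished = job.finished_at.isoformat() if job.finished_at else "still running"
    return (f"created {age.total_seconds():.0f}s ago, "
            f"items_limit={job.items_limit}, finished: {finished}")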