webcrawlerapi 2.0.3__tar.gz → 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
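The most user-visible change in 2.0.4 is to the client constructor: the version argument is removed, and the crawl and scrape endpoints are instead pinned to fixed API versions through the new module-level constants CRAWLER_VERSION ("v1") and SCRAPER_VERSION ("v2"). A minimal usage sketch of that change follows; it is not taken from the package itself, it assumes the package root exports WebCrawlerAPI, and the API key value is a placeholder.

from webcrawlerapi import WebCrawlerAPI  # assumption: the package root exports the client class

# 2.0.3: the API version was a constructor argument applied to every endpoint.
# client = WebCrawlerAPI(api_key="YOUR_API_KEY", version="v1")

# 2.0.4: no version argument; crawl endpoints use v1 and scrape endpoints use v2
# via the module-level constants introduced in the diff below.
client = WebCrawlerAPI(api_key="YOUR_API_KEY")  # YOUR_API_KEY is a placeholder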
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.3
+Version: 2.0.4
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="webcrawlerapi",
-    version="2.0.3",
+    version="2.0.4",
     packages=find_packages(),
     install_requires=[
         "requests>=2.25.0",
@@ -12,13 +12,15 @@ from .models import (
     Action,
 )
 
+CRAWLER_VERSION = "v1"
+SCRAPER_VERSION = "v2"
 
 class WebCrawlerAPI:
     """Python SDK for WebCrawler API."""
 
     DEFAULT_POLL_DELAY_SECONDS = 5
 
-    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v1"):
+    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com"):
         """
         Initialize the WebCrawler API client.
 
@@ -29,7 +31,6 @@ class WebCrawlerAPI:
         """
         self.api_key = api_key
         self.base_url = base_url.rstrip('/')
-        self.version = version
         self.session = requests.Session()
         self.session.headers.update({
             'Authorization': f'Bearer {api_key}',
@@ -86,7 +87,7 @@ class WebCrawlerAPI:
             payload["actions"] = [vars(action) for action in action_list]
 
         response = self.session.post(
-            urljoin(self.base_url, f"/{self.version}/crawl"),
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/crawl"),
             json=payload
         )
         response.raise_for_status()
@@ -106,7 +107,7 @@ class WebCrawlerAPI:
             requests.exceptions.RequestException: If the API request fails
         """
         response = self.session.get(
-            urljoin(self.base_url, f"/{self.version}/job/{job_id}")
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}")
         )
         response.raise_for_status()
         return Job(response.json())
@@ -126,7 +127,7 @@ class WebCrawlerAPI:
             requests.exceptions.RequestException: If the API request fails
         """
         response = self.session.put(
-            urljoin(self.base_url, f"/{self.version}/job/{job_id}/cancel")
+            urljoin(self.base_url, f"/{CRAWLER_VERSION}/job/{job_id}/cancel")
         )
         response.raise_for_status()
         return response.json()
@@ -246,7 +247,7 @@ class WebCrawlerAPI:
             payload["actions"] = [vars(action) for action in action_list]
 
         response = self.session.post(
-            urljoin(self.base_url, f"/{self.version}/scrape?async=true"),
+            urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape?async=true"),
             json=payload
         )
 
@@ -279,7 +280,7 @@ class WebCrawlerAPI:
             requests.exceptions.RequestException: If the API request fails
         """
         response = self.session.get(
-            urljoin(self.base_url, f"/{self.version}/scrape/{scrape_id}")
+            urljoin(self.base_url, f"/{SCRAPER_VERSION}/scrape/{scrape_id}")
         )
 
         response.raise_for_status()
@@ -1,34 +1,6 @@
 from typing import Optional, Dict, Any, List
 from datetime import datetime
 from dataclasses import dataclass
-import re
-
-
-def parse_datetime(datetime_str: str) -> datetime:
-    """
-    Parse datetime string from API response, handling various microsecond formats.
-
-    Args:
-        datetime_str (str): Datetime string from API
-
-    Returns:
-        datetime: Parsed datetime object
-    """
-    # Replace 'Z' with '+00:00' for timezone
-    datetime_str = datetime_str.replace('Z', '+00:00')
-
-    # Handle microseconds - pad to 6 digits or remove if present
-    # Pattern matches: YYYY-MM-DDTHH:MM:SS.microseconds followed by timezone or end
-    pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
-    match = re.match(pattern, datetime_str)
-
-    if match:
-        base_time, microseconds, timezone_part = match.groups()
-        # Pad microseconds to 6 digits or truncate if longer
-        microseconds = microseconds.ljust(6, '0')[:6]
-        datetime_str = f"{base_time}.{microseconds}{timezone_part}"
-
-    return datetime.fromisoformat(datetime_str)
 
 
 @dataclass
@@ -106,8 +78,8 @@ class JobItem:
         self.page_status_code: int = data["page_status_code"]
         self.status: str = data["status"]
         self.title: str = data["title"]
-        self.created_at: datetime = parse_datetime(data["created_at"])
-        self.updated_at: datetime = parse_datetime(data["updated_at"])
+        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
+        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
         self.cost: int = data.get("cost", 0)
         self.referred_url: Optional[str] = data.get("referred_url")
         self.last_error: Optional[str] = data.get("last_error")
@@ -185,17 +157,17 @@ class Job:
         self.scrape_type: str = data["scrape_type"]
         self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
         self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
-        self.allow_subdomains: bool = data.get("allow_subdomains", False)
+        self.allow_subdomains: bool = data["allow_subdomains"]
         self.items_limit: int = data["items_limit"]
-        self.created_at: datetime = parse_datetime(data["created_at"])
-        self.updated_at: datetime = parse_datetime(data["updated_at"])
+        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
+        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
         self.webhook_url: Optional[str] = data.get("webhook_url")
         self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
 
         # Optional fields
         self.finished_at: Optional[datetime] = None
         if data.get("finished_at"):
-            self.finished_at = parse_datetime(data["finished_at"])
+            self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
 
         self.webhook_status: Optional[str] = data.get("webhook_status")
         self.webhook_error: Optional[str] = data.get("webhook_error")
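The models hunks above remove the parse_datetime helper, which normalized the microsecond field before parsing, and instead pass timestamps straight to datetime.fromisoformat after swapping a trailing 'Z' for '+00:00'. A small sketch of the new behavior, using an illustrative timestamp rather than a real API response:

from datetime import datetime

raw = "2026-01-15T10:30:00.123456Z"  # illustrative value, not from the API
parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
# 2.0.3 first padded or truncated the fractional-seconds field to six digits;
# 2.0.4 relies on fromisoformat() accepting the string as-is once 'Z' is replaced.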
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webcrawlerapi
-Version: 2.0.3
+Version: 2.0.4
 Summary: Python SDK for WebCrawler API
 Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
 Author: Andrew
File without changes
File without changes