webcrawlerapi 2.0.4__tar.gz → 2.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/PKG-INFO +3 -3
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/README.md +2 -2
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/setup.py +1 -1
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi/models.py +34 -6
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi.egg-info/PKG-INFO +3 -3
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/setup.cfg +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi/__init__.py +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi/client.py +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.4 → webcrawlerapi-2.0.5}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: webcrawlerapi
|
|
3
|
-
Version: 2.0.4
|
|
3
|
+
Version: 2.0.5
|
|
4
4
|
Summary: Python SDK for WebCrawler API
|
|
5
5
|
Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
|
|
6
6
|
Author: Andrew
|
|
@@ -101,11 +101,11 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
101
101
|
```
|
|
102
102
|
|
|
103
103
|
### Scraping
|
|
104
|
-
|
|
104
|
+
Check a working code example of [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt)
|
|
105
105
|
```python
|
|
106
106
|
# Returns structured data directly
|
|
107
107
|
response = crawler.scrape(
|
|
108
|
-
|
|
108
|
+
url="https://webcrawlerapi.com"
|
|
109
109
|
)
|
|
110
110
|
if response.success:
|
|
111
111
|
print(response.markdown)
|
|
@@ -80,11 +80,11 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
### Scraping
|
|
83
|
-
|
|
83
|
+
Check a working code example of [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt)
|
|
84
84
|
```python
|
|
85
85
|
# Returns structured data directly
|
|
86
86
|
response = crawler.scrape(
|
|
87
|
-
|
|
87
|
+
url="https://webcrawlerapi.com"
|
|
88
88
|
)
|
|
89
89
|
if response.success:
|
|
90
90
|
print(response.markdown)
|
|
@@ -1,6 +1,34 @@
|
|
|
1
1
|
from typing import Optional, Dict, Any, List
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_datetime(datetime_str: str) -> datetime:
    """
    Parse datetime string from API response, handling various microsecond formats.

    ``datetime.fromisoformat`` on older Python versions only accepts a
    fractional-seconds part of exactly 3 or 6 digits and does not understand
    the ``Z`` UTC designator, so the string is normalised before parsing:

    * a trailing ``Z``/``z`` is rewritten to ``+00:00``;
    * the fractional-seconds part is right-padded with zeros to 6 digits,
      or truncated to 6 digits when it is longer.

    Args:
        datetime_str (str): Datetime string from API, e.g.
            ``2024-01-15T10:30:45.1234567Z``. The date and time may be
            separated by ``T`` or by a single space.

    Returns:
        datetime: Parsed datetime object (timezone-aware when the input
            carries a UTC designator or an offset, naive otherwise)

    Raises:
        ValueError: If the normalised string is not a valid ISO 8601 datetime.
    """
    # Only a *trailing* 'Z' is a UTC designator; rewriting every 'Z' in the
    # string would corrupt anything else that happens to contain the letter.
    if datetime_str[-1:] in ('Z', 'z'):
        datetime_str = datetime_str[:-1] + '+00:00'

    # Handle fractional seconds - pad to 6 digits or truncate if longer.
    # Pattern matches: YYYY-MM-DD, a 'T' or space separator, HH:MM:SS,
    # '.', the fraction digits, then whatever follows (timezone or nothing).
    pattern = r'(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2})\.(\d+)(.*)'
    match = re.match(pattern, datetime_str)

    if match:
        base_time, microseconds, timezone_part = match.groups()
        # ljust pads short fractions ("1" -> "100000"); the slice drops any
        # precision beyond microseconds ("1234567" -> "123456").
        microseconds = microseconds.ljust(6, '0')[:6]
        datetime_str = f"{base_time}.{microseconds}{timezone_part}"

    return datetime.fromisoformat(datetime_str)
|
|
4
32
|
|
|
5
33
|
|
|
6
34
|
@dataclass
|
|
@@ -78,8 +106,8 @@ class JobItem:
|
|
|
78
106
|
self.page_status_code: int = data["page_status_code"]
|
|
79
107
|
self.status: str = data["status"]
|
|
80
108
|
self.title: str = data["title"]
|
|
81
|
-
self.created_at: datetime =
|
|
82
|
-
self.updated_at: datetime =
|
|
109
|
+
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
110
|
+
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
83
111
|
self.cost: int = data.get("cost", 0)
|
|
84
112
|
self.referred_url: Optional[str] = data.get("referred_url")
|
|
85
113
|
self.last_error: Optional[str] = data.get("last_error")
|
|
@@ -157,17 +185,17 @@ class Job:
|
|
|
157
185
|
self.scrape_type: str = data["scrape_type"]
|
|
158
186
|
self.whitelist_regexp: Optional[str] = data.get("whitelist_regexp")
|
|
159
187
|
self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
|
|
160
|
-
self.allow_subdomains: bool = data
|
|
188
|
+
self.allow_subdomains: bool = data.get("allow_subdomains", False)
|
|
161
189
|
self.items_limit: int = data["items_limit"]
|
|
162
|
-
self.created_at: datetime =
|
|
163
|
-
self.updated_at: datetime =
|
|
190
|
+
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
191
|
+
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
164
192
|
self.webhook_url: Optional[str] = data.get("webhook_url")
|
|
165
193
|
self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
|
|
166
194
|
|
|
167
195
|
# Optional fields
|
|
168
196
|
self.finished_at: Optional[datetime] = None
|
|
169
197
|
if data.get("finished_at"):
|
|
170
|
-
self.finished_at =
|
|
198
|
+
self.finished_at = parse_datetime(data["finished_at"])
|
|
171
199
|
|
|
172
200
|
self.webhook_status: Optional[str] = data.get("webhook_status")
|
|
173
201
|
self.webhook_error: Optional[str] = data.get("webhook_error")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: webcrawlerapi
|
|
3
|
-
Version: 2.0.4
|
|
3
|
+
Version: 2.0.5
|
|
4
4
|
Summary: Python SDK for WebCrawler API
|
|
5
5
|
Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
|
|
6
6
|
Author: Andrew
|
|
@@ -101,11 +101,11 @@ print(f"Cancellation response: {cancel_response['message']}")
|
|
|
101
101
|
```
|
|
102
102
|
|
|
103
103
|
### Scraping
|
|
104
|
-
|
|
104
|
+
Check a working code example of [scraping](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping) and [scraping with a prompt](https://github.com/WebCrawlerAPI/webcrawlerapi-examples/tree/master/python/scraping_prompt)
|
|
105
105
|
```python
|
|
106
106
|
# Returns structured data directly
|
|
107
107
|
response = crawler.scrape(
|
|
108
|
-
|
|
108
|
+
url="https://webcrawlerapi.com"
|
|
109
109
|
)
|
|
110
110
|
if response.success:
|
|
111
111
|
print(response.markdown)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|