webcrawlerapi-1.0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webcrawlerapi-1.0.1/PKG-INFO +152 -0
- webcrawlerapi-1.0.1/README.md +139 -0
- webcrawlerapi-1.0.1/setup.cfg +4 -0
- webcrawlerapi-1.0.1/setup.py +21 -0
- webcrawlerapi-1.0.1/webcrawlerapi/__init__.py +19 -0
- webcrawlerapi-1.0.1/webcrawlerapi/client.py +251 -0
- webcrawlerapi-1.0.1/webcrawlerapi.egg-info/PKG-INFO +152 -0
- webcrawlerapi-1.0.1/webcrawlerapi.egg-info/SOURCES.txt +9 -0
- webcrawlerapi-1.0.1/webcrawlerapi.egg-info/dependency_links.txt +1 -0
- webcrawlerapi-1.0.1/webcrawlerapi.egg-info/requires.txt +1 -0
- webcrawlerapi-1.0.1/webcrawlerapi.egg-info/top_level.txt +1 -0
webcrawlerapi-1.0.1/PKG-INFO
@@ -0,0 +1,152 @@
+Metadata-Version: 2.1
+Name: webcrawlerapi
+Version: 1.0.1
+Summary: Python SDK for WebCrawler API
+Home-page: https://github.com/yourusername/webcrawlerapi-python
+Author: Andrew
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+Requires-Dist: requests>=2.25.0
+
+# WebCrawler API Python SDK
+
+A Python SDK for interacting with the WebCrawler API.
+
+## Installation
+
+```bash
+pip install webcrawlerapi
+```
+
+## Usage
+
+```python
+from webcrawlerapi import WebCrawlerAPI
+
+# Initialize the client
+crawler = WebCrawlerAPI(api_key="your_api_key")
+
+# Synchronous crawling (blocks until completion)
+job = crawler.crawl(
+    url="https://example.com",
+    scrape_type="markdown",
+    items_limit=10,
+    webhook_url="https://yourserver.com/webhook",
+    allow_subdomains=False,
+    max_polls=100  # Optional: maximum number of status checks
+)
+print(f"Job completed with status: {job.status}")
+
+# Or use asynchronous crawling
+response = crawler.crawl_async(
+    url="https://example.com",
+    scrape_type="markdown",
+    items_limit=10,
+    webhook_url="https://yourserver.com/webhook",
+    allow_subdomains=False
+)
+
+# Get the job ID from the response
+job_id = response.id
+print(f"Crawling job started with ID: {job_id}")
+
+# Check job status and get results
+job = crawler.get_job(job_id)
+print(f"Job status: {job.status}")
+
+# Access job details
+print(f"Crawled URL: {job.url}")
+print(f"Created at: {job.created_at}")
+print(f"Number of items: {len(job.job_items)}")
+
+# Access individual crawled items
+for item in job.job_items:
+    print(f"Page title: {item.title}")
+    print(f"Original URL: {item.original_url}")
+    print(f"Content URL: {item.markdown_content_url}")
+
+# Cancel a running job if needed
+cancel_response = crawler.cancel_job(job_id)
+print(f"Cancellation response: {cancel_response['message']}")
+```
+
+## API Methods
+
+### crawl()
+Starts a new crawling job and waits for its completion. This method will continuously poll the job status until:
+- The job reaches a terminal state (done, error, or cancelled)
+- The maximum number of polls is reached (default: 100)
+- The polling interval is determined by the server's `recommended_pull_delay_ms` or defaults to 5 seconds
+
+### crawl_async()
+Starts a new crawling job and returns immediately with a job ID. Use this when you want to handle polling and status checks yourself, or when using webhooks.
+
+### get_job()
+Retrieves the current status and details of a specific job.
+
+### cancel_job()
+Cancels a running job. Any items that are not in progress or already completed will be marked as canceled and will not be charged.
+
+## Parameters
+
+### Crawl Methods (crawl and crawl_async)
+- `url` (required): The seed URL where the crawler starts. Can be any valid URL.
+- `scrape_type` (default: "html"): The type of scraping you want to perform. Can be "html", "cleaned", or "markdown".
+- `items_limit` (default: 10): Crawler will stop when it reaches this limit of pages for this job.
+- `webhook_url` (optional): The URL where the server will send a POST request once the task is completed.
+- `allow_subdomains` (default: False): If True, the crawler will also crawl subdomains.
+- `whitelist_regexp` (optional): A regular expression to whitelist URLs. Only URLs that match the pattern will be crawled.
+- `blacklist_regexp` (optional): A regular expression to blacklist URLs. URLs that match the pattern will be skipped.
+- `max_polls` (optional, crawl only): Maximum number of status checks before returning (default: 100)
+
+### Responses
+
+#### CrawlAsync Response
+The `crawl_async()` method returns a `CrawlResponse` object with:
+- `id`: The unique identifier of the created job
+
+#### Job Response
+The Job object contains detailed information about the crawling job:
+
+- `id`: The unique identifier of the job
+- `org_id`: Your organization identifier
+- `url`: The seed URL where the crawler started
+- `status`: The status of the job (new, in_progress, done, error)
+- `scrape_type`: The type of scraping performed
+- `created_at`: The date when the job was created
+- `finished_at`: The date when the job was finished (if completed)
+- `webhook_url`: The webhook URL for notifications
+- `webhook_status`: The status of the webhook request
+- `webhook_error`: Any error message if the webhook request failed
+- `job_items`: List of JobItem objects representing crawled pages
+- `recommended_pull_delay_ms`: Server-recommended delay between status checks
+
+### JobItem Properties
+
+Each JobItem object represents a crawled page and contains:
+
+- `id`: The unique identifier of the item
+- `job_id`: The parent job identifier
+- `original_url`: The URL of the page
+- `page_status_code`: The HTTP status code of the page request
+- `status`: The status of the item (new, in_progress, done, error)
+- `title`: The page title
+- `created_at`: The date when the item was created
+- `cost`: The cost of the item in $
+- `referred_url`: The URL where the page was referred from
+- `last_error`: Any error message if the item failed
+- `raw_content_url`: URL to the raw content (if available)
+- `cleaned_content_url`: URL to the cleaned content (if scrape_type is "cleaned")
+- `markdown_content_url`: URL to the markdown content (if scrape_type is "markdown")
+
+## Requirements
+
+- Python 3.6+
+- requests>=2.25.0
+
+## License
+
+MIT License
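The README packaged above says the server sends a POST request to `webhook_url` once a crawl finishes, but the payload shape is not documented anywhere in this sdist. The sketch below is a minimal receiver using only the Python standard library and makes no assumptions beyond "a JSON body may arrive"; the port and the idea of simply logging the payload are illustrative choices, not part of the package.

```python
# Minimal sketch of a receiver for the documented webhook_url callback.
# Assumption: the notification is an HTTP POST, possibly with a JSON body;
# the exact fields are not documented in this package, so we only log them.
import json
from http.server import BaseHTTPRequestHandler, HTTPServer


class CrawlWebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        raw_body = self.rfile.read(length)
        try:
            payload = json.loads(raw_body or b"{}")
        except json.JSONDecodeError:
            payload = {"raw": raw_body.decode("utf-8", errors="replace")}
        print("Crawl notification received:", payload)
        self.send_response(200)
        self.end_headers()


if __name__ == "__main__":
    # Expose this address as the webhook_url you pass to crawl()/crawl_async().
    HTTPServer(("0.0.0.0", 8000), CrawlWebhookHandler).serve_forever()
```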
webcrawlerapi-1.0.1/README.md
@@ -0,0 +1,139 @@
+# WebCrawler API Python SDK
+
+A Python SDK for interacting with the WebCrawler API.
+
+## Installation
+
+```bash
+pip install webcrawlerapi
+```
+
+## Usage
+
+```python
+from webcrawlerapi import WebCrawlerAPI
+
+# Initialize the client
+crawler = WebCrawlerAPI(api_key="your_api_key")
+
+# Synchronous crawling (blocks until completion)
+job = crawler.crawl(
+    url="https://example.com",
+    scrape_type="markdown",
+    items_limit=10,
+    webhook_url="https://yourserver.com/webhook",
+    allow_subdomains=False,
+    max_polls=100  # Optional: maximum number of status checks
+)
+print(f"Job completed with status: {job.status}")
+
+# Or use asynchronous crawling
+response = crawler.crawl_async(
+    url="https://example.com",
+    scrape_type="markdown",
+    items_limit=10,
+    webhook_url="https://yourserver.com/webhook",
+    allow_subdomains=False
+)
+
+# Get the job ID from the response
+job_id = response.id
+print(f"Crawling job started with ID: {job_id}")
+
+# Check job status and get results
+job = crawler.get_job(job_id)
+print(f"Job status: {job.status}")
+
+# Access job details
+print(f"Crawled URL: {job.url}")
+print(f"Created at: {job.created_at}")
+print(f"Number of items: {len(job.job_items)}")
+
+# Access individual crawled items
+for item in job.job_items:
+    print(f"Page title: {item.title}")
+    print(f"Original URL: {item.original_url}")
+    print(f"Content URL: {item.markdown_content_url}")
+
+# Cancel a running job if needed
+cancel_response = crawler.cancel_job(job_id)
+print(f"Cancellation response: {cancel_response['message']}")
+```
+
+## API Methods
+
+### crawl()
+Starts a new crawling job and waits for its completion. This method will continuously poll the job status until:
+- The job reaches a terminal state (done, error, or cancelled)
+- The maximum number of polls is reached (default: 100)
+- The polling interval is determined by the server's `recommended_pull_delay_ms` or defaults to 5 seconds
+
+### crawl_async()
+Starts a new crawling job and returns immediately with a job ID. Use this when you want to handle polling and status checks yourself, or when using webhooks.
+
+### get_job()
+Retrieves the current status and details of a specific job.
+
+### cancel_job()
+Cancels a running job. Any items that are not in progress or already completed will be marked as canceled and will not be charged.
+
+## Parameters
+
+### Crawl Methods (crawl and crawl_async)
+- `url` (required): The seed URL where the crawler starts. Can be any valid URL.
+- `scrape_type` (default: "html"): The type of scraping you want to perform. Can be "html", "cleaned", or "markdown".
+- `items_limit` (default: 10): Crawler will stop when it reaches this limit of pages for this job.
+- `webhook_url` (optional): The URL where the server will send a POST request once the task is completed.
+- `allow_subdomains` (default: False): If True, the crawler will also crawl subdomains.
+- `whitelist_regexp` (optional): A regular expression to whitelist URLs. Only URLs that match the pattern will be crawled.
+- `blacklist_regexp` (optional): A regular expression to blacklist URLs. URLs that match the pattern will be skipped.
+- `max_polls` (optional, crawl only): Maximum number of status checks before returning (default: 100)
+
+### Responses
+
+#### CrawlAsync Response
+The `crawl_async()` method returns a `CrawlResponse` object with:
+- `id`: The unique identifier of the created job
+
+#### Job Response
+The Job object contains detailed information about the crawling job:
+
+- `id`: The unique identifier of the job
+- `org_id`: Your organization identifier
+- `url`: The seed URL where the crawler started
+- `status`: The status of the job (new, in_progress, done, error)
+- `scrape_type`: The type of scraping performed
+- `created_at`: The date when the job was created
+- `finished_at`: The date when the job was finished (if completed)
+- `webhook_url`: The webhook URL for notifications
+- `webhook_status`: The status of the webhook request
+- `webhook_error`: Any error message if the webhook request failed
+- `job_items`: List of JobItem objects representing crawled pages
+- `recommended_pull_delay_ms`: Server-recommended delay between status checks
+
+### JobItem Properties
+
+Each JobItem object represents a crawled page and contains:
+
+- `id`: The unique identifier of the item
+- `job_id`: The parent job identifier
+- `original_url`: The URL of the page
+- `page_status_code`: The HTTP status code of the page request
+- `status`: The status of the item (new, in_progress, done, error)
+- `title`: The page title
+- `created_at`: The date when the item was created
+- `cost`: The cost of the item in $
+- `referred_url`: The URL where the page was referred from
+- `last_error`: Any error message if the item failed
+- `raw_content_url`: URL to the raw content (if available)
+- `cleaned_content_url`: URL to the cleaned content (if scrape_type is "cleaned")
+- `markdown_content_url`: URL to the markdown content (if scrape_type is "markdown")
+
+## Requirements
+
+- Python 3.6+
+- requests>=2.25.0
+
+## License
+
+MIT License
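The README above pairs `crawl_async()` with `get_job()` for callers who want to poll on their own schedule. A minimal sketch of that loop, using only names documented in the README (the terminal statuses, `recommended_pull_delay_ms`, and the 5-second fallback are taken from the text; the API key and URL are placeholders):

```python
# Sketch of the manual polling flow the README describes for crawl_async().
import time

from webcrawlerapi import WebCrawlerAPI

crawler = WebCrawlerAPI(api_key="your_api_key")
response = crawler.crawl_async(url="https://example.com", scrape_type="markdown")

job = crawler.get_job(response.id)
while job.status not in ("done", "error", "cancelled"):
    # Honor the server-recommended delay, falling back to the same
    # 5-second default the synchronous crawl() documents above.
    delay_ms = job.recommended_pull_delay_ms or 5000
    time.sleep(delay_ms / 1000)
    job = crawler.get_job(response.id)

print(f"Job {job.id} finished with status {job.status} and {len(job.job_items)} items")
```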
webcrawlerapi-1.0.1/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="webcrawlerapi",
+    version="1.0.1",
+    packages=find_packages(),
+    install_requires=[
+        "requests>=2.25.0",
+    ],
+    author="Andrew",
+    description="Python SDK for WebCrawler API",
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/yourusername/webcrawlerapi-python",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)
webcrawlerapi-1.0.1/webcrawlerapi/__init__.py
@@ -0,0 +1,19 @@
+"""
+WebCrawler API Python SDK
+~~~~~~~~~~~~~~~~~~~~~
+
+A Python SDK for interacting with the WebCrawler API.
+
+Basic usage:
+
+>>> from webcrawlerapi import WebCrawlerAPI
+>>> crawler = WebCrawlerAPI(api_key="your_api_key")
+>>> response = crawler.crawl(url="https://example.com")
+>>> job_id = response["job_id"]
+>>> job = crawler.get_job(job_id)
+"""
+
+from .client import WebCrawlerAPI, Job, JobItem
+
+__version__ = "1.0.0"
+__all__ = ["WebCrawlerAPI", "Job", "JobItem"]
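Two details in this `__init__.py` are inconsistent with the rest of the sdist: `__version__` is still "1.0.0" although the package metadata says 1.0.1, and the doctest indexes the crawl result as `response["job_id"]`, whereas in `client.py` (next hunk) `crawl()` returns a `Job` object and `crawl_async()` returns a `CrawlResponse` whose job ID is the `id` attribute. A usage snippet consistent with the shipped client would look like the sketch below (the API key and URL are placeholders):

```python
# Consistent with client.py as shipped in 1.0.1: crawl_async() returns a
# CrawlResponse, and its job ID is an attribute rather than a dict key.
from webcrawlerapi import WebCrawlerAPI

crawler = WebCrawlerAPI(api_key="your_api_key")
response = crawler.crawl_async(url="https://example.com")
job = crawler.get_job(response.id)
```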
webcrawlerapi-1.0.1/webcrawlerapi/client.py
@@ -0,0 +1,251 @@
+import requests
+from typing import Optional, Dict, Any, List
+from urllib.parse import urljoin
+from datetime import datetime
+import time
+from dataclasses import dataclass
+
+
+@dataclass
+class CrawlResponse:
+    """Response from an asynchronous crawl request."""
+    id: str
+
+
+class JobItem:
+    """Represents a single crawled page item in a job."""
+
+    def __init__(self, data: Dict[str, Any]):
+        self.id: str = data["id"]
+        self.job_id: str = data["job_id"]
+        self.original_url: str = data["original_url"]
+        self.page_status_code: int = data["page_status_code"]
+        self.status: str = data["status"]
+        self.title: str = data["title"]
+        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
+        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
+        self.cost: int = data["cost"]
+        self.referred_url: str = data["referred_url"]
+        self.last_error: str = data["last_error"]
+
+        # Optional content URLs based on scrape_type
+        self.raw_content_url: Optional[str] = data.get("raw_content_url")
+        self.cleaned_content_url: Optional[str] = data.get("cleaned_content_url")
+        self.markdown_content_url: Optional[str] = data.get("markdown_content_url")
+
+
+class Job:
+    """Represents a crawling job."""
+
+    TERMINAL_STATUSES = {"done", "error", "cancelled"}
+
+    def __init__(self, data: Dict[str, Any]):
+        self.id: str = data["id"]
+        self.org_id: str = data["org_id"]
+        self.url: str = data["url"]
+        self.status: str = data["status"]
+        self.scrape_type: str = data["scrape_type"]
+        self.whitelist_regexp: str = data["whitelist_regexp"]
+        self.blacklist_regexp: str = data["blacklist_regexp"]
+        self.allow_subdomains: bool = data["allow_subdomains"]
+        self.items_limit: int = data["items_limit"]
+        self.created_at: datetime = datetime.fromisoformat(data["created_at"].replace('Z', '+00:00'))
+        self.updated_at: datetime = datetime.fromisoformat(data["updated_at"].replace('Z', '+00:00'))
+        self.webhook_url: str = data["webhook_url"]
+        self.recommended_pull_delay_ms: int = data.get("recommended_pull_delay_ms", 0)
+
+        # Optional fields
+        self.finished_at: Optional[datetime] = None
+        if data.get("finished_at"):
+            self.finished_at = datetime.fromisoformat(data["finished_at"].replace('Z', '+00:00'))
+
+        self.webhook_status: Optional[str] = data.get("webhook_status")
+        self.webhook_error: Optional[str] = data.get("webhook_error")
+
+        # Parse job items
+        self.job_items: List[JobItem] = [JobItem(item) for item in data.get("job_items", [])]
+
+    @property
+    def is_terminal(self) -> bool:
+        """Check if the job is in a terminal state (done, error, or cancelled)."""
+        return self.status in self.TERMINAL_STATUSES
+
+
+class WebCrawlerAPI:
+    """Python SDK for WebCrawler API."""
+
+    DEFAULT_POLL_DELAY_SECONDS = 5
+
+    def __init__(self, api_key: str, base_url: str = "https://api.webcrawlerapi.com", version: str = "v1"):
+        """
+        Initialize the WebCrawler API client.
+
+        Args:
+            api_key (str): Your API key for authentication
+            base_url (str): The base URL of the API (optional)
+            version (str): API version to use (optional, defaults to 'v1')
+        """
+        self.api_key = api_key
+        self.base_url = base_url.rstrip('/')
+        self.version = version
+        self.session = requests.Session()
+        self.session.headers.update({
+            'Authorization': f'Bearer {api_key}',
+            'Content-Type': 'application/json'
+        })
+
+    def crawl_async(
+        self,
+        url: str,
+        scrape_type: str = "html",
+        items_limit: int = 10,
+        webhook_url: Optional[str] = None,
+        allow_subdomains: bool = False,
+        whitelist_regexp: Optional[str] = None,
+        blacklist_regexp: Optional[str] = None
+    ) -> CrawlResponse:
+        """
+        Start a new crawling job asynchronously.
+
+        Args:
+            url (str): The seed URL where the crawler starts
+            scrape_type (str): Type of scraping (html, cleaned, markdown)
+            items_limit (int): Maximum number of pages to crawl
+            webhook_url (str, optional): URL for webhook notifications
+            allow_subdomains (bool): Whether to crawl subdomains
+            whitelist_regexp (str, optional): Regex pattern for URL whitelist
+            blacklist_regexp (str, optional): Regex pattern for URL blacklist
+
+        Returns:
+            CrawlResponse: Response containing the job ID
+
+        Raises:
+            requests.exceptions.RequestException: If the API request fails
+        """
+        payload = {
+            "url": url,
+            "scrape_type": scrape_type,
+            "items_limit": items_limit,
+            "allow_subdomains": allow_subdomains
+        }
+
+        if webhook_url:
+            payload["webhook_url"] = webhook_url
+        if whitelist_regexp:
+            payload["whitelist_regexp"] = whitelist_regexp
+        if blacklist_regexp:
+            payload["blacklist_regexp"] = blacklist_regexp
+
+        response = self.session.post(
+            urljoin(self.base_url, f"/{self.version}/crawl"),
+            json=payload
+        )
+        response.raise_for_status()
+        return CrawlResponse(id=response.json()["job_id"])
+
+    def get_job(self, job_id: str) -> Job:
+        """
+        Get the status and details of a specific job.
+
+        Args:
+            job_id (str): The unique identifier of the job
+
+        Returns:
+            Job: A Job object containing all job details and items
+
+        Raises:
+            requests.exceptions.RequestException: If the API request fails
+        """
+        response = self.session.get(
+            urljoin(self.base_url, f"/{self.version}/job/{job_id}")
+        )
+        response.raise_for_status()
+        return Job(response.json())
+
+    def cancel_job(self, job_id: str) -> Dict[str, str]:
+        """
+        Cancel a running job. All items that are not in progress and not done
+        will be marked as canceled and will not be charged.
+
+        Args:
+            job_id (str): The unique identifier of the job to cancel
+
+        Returns:
+            dict: Response containing confirmation message
+
+        Raises:
+            requests.exceptions.RequestException: If the API request fails
+        """
+        response = self.session.put(
+            urljoin(self.base_url, f"/{self.version}/job/{job_id}/cancel")
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def crawl(
+        self,
+        url: str,
+        scrape_type: str = "html",
+        items_limit: int = 10,
+        webhook_url: Optional[str] = None,
+        allow_subdomains: bool = False,
+        whitelist_regexp: Optional[str] = None,
+        blacklist_regexp: Optional[str] = None,
+        max_polls: int = 100
+    ) -> Job:
+        """
+        Start a new crawling job and wait for its completion.
+
+        This method will start a crawling job and continuously poll its status
+        until it reaches a terminal state (done, error, or cancelled) or until
+        the maximum number of polls is reached.
+
+        Args:
+            url (str): The seed URL where the crawler starts
+            scrape_type (str): Type of scraping (html, cleaned, markdown)
+            items_limit (int): Maximum number of pages to crawl
+            webhook_url (str, optional): URL for webhook notifications
+            allow_subdomains (bool): Whether to crawl subdomains
+            whitelist_regexp (str, optional): Regex pattern for URL whitelist
+            blacklist_regexp (str, optional): Regex pattern for URL blacklist
+            max_polls (int): Maximum number of status checks before returning (default: 100)
+
+        Returns:
+            Job: The final job state after completion or max polls
+
+        Raises:
+            requests.exceptions.RequestException: If any API request fails
+        """
+        # Start the crawling job
+        response = self.crawl_async(
+            url=url,
+            scrape_type=scrape_type,
+            items_limit=items_limit,
+            webhook_url=webhook_url,
+            allow_subdomains=allow_subdomains,
+            whitelist_regexp=whitelist_regexp,
+            blacklist_regexp=blacklist_regexp
+        )
+
+        job_id = response.id
+        polls = 0
+
+        while polls < max_polls:
+            job = self.get_job(job_id)
+
+            # Return immediately if job is in a terminal state
+            if job.is_terminal:
+                return job
+
+            # Calculate delay for next poll
+            delay_seconds = (
+                job.recommended_pull_delay_ms / 1000
+                if job.recommended_pull_delay_ms
+                else self.DEFAULT_POLL_DELAY_SECONDS
+            )
+
+            time.sleep(delay_seconds)
+            polls += 1
+
+        # Return the last known state if max_polls is reached
+        return job
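Note that `client.py` exposes crawled content only as URLs (`raw_content_url`, `cleaned_content_url`, `markdown_content_url`); downloading the content itself is left to the caller. The sketch below shows one way to do that and to handle the `requests.exceptions.RequestException` every client method raises on HTTP errors. It assumes, without confirmation from this package, that the content URLs are plain HTTP resources retrievable with an unauthenticated GET; the API key, seed URL, and timeout are placeholders.

```python
# Sketch: download the markdown for each finished item of a "markdown" crawl.
# Assumption (not confirmed by this package): markdown_content_url can be
# fetched with a regular, unauthenticated GET request.
import requests

from webcrawlerapi import WebCrawlerAPI

crawler = WebCrawlerAPI(api_key="your_api_key")

try:
    job = crawler.crawl(url="https://example.com", scrape_type="markdown", items_limit=5)
except requests.exceptions.RequestException as exc:
    # crawl(), get_job(), and cancel_job() all call raise_for_status(),
    # so HTTP failures surface here.
    raise SystemExit(f"Crawl failed: {exc}")

for item in job.job_items:
    if item.status == "done" and item.markdown_content_url:
        content = requests.get(item.markdown_content_url, timeout=30)
        content.raise_for_status()
        print(f"# {item.title}\n{content.text[:200]}\n")
```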
webcrawlerapi-1.0.1/webcrawlerapi.egg-info/PKG-INFO
@@ -0,0 +1,152 @@
+Metadata-Version: 2.1
+Name: webcrawlerapi
+Version: 1.0.1
+Summary: Python SDK for WebCrawler API
+Home-page: https://github.com/yourusername/webcrawlerapi-python
+Author: Andrew
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+Requires-Dist: requests>=2.25.0
+
+# WebCrawler API Python SDK
+
+A Python SDK for interacting with the WebCrawler API.
+
+## Installation
+
+```bash
+pip install webcrawlerapi
+```
+
+## Usage
+
+```python
+from webcrawlerapi import WebCrawlerAPI
+
+# Initialize the client
+crawler = WebCrawlerAPI(api_key="your_api_key")
+
+# Synchronous crawling (blocks until completion)
+job = crawler.crawl(
+    url="https://example.com",
+    scrape_type="markdown",
+    items_limit=10,
+    webhook_url="https://yourserver.com/webhook",
+    allow_subdomains=False,
+    max_polls=100  # Optional: maximum number of status checks
+)
+print(f"Job completed with status: {job.status}")
+
+# Or use asynchronous crawling
+response = crawler.crawl_async(
+    url="https://example.com",
+    scrape_type="markdown",
+    items_limit=10,
+    webhook_url="https://yourserver.com/webhook",
+    allow_subdomains=False
+)
+
+# Get the job ID from the response
+job_id = response.id
+print(f"Crawling job started with ID: {job_id}")
+
+# Check job status and get results
+job = crawler.get_job(job_id)
+print(f"Job status: {job.status}")
+
+# Access job details
+print(f"Crawled URL: {job.url}")
+print(f"Created at: {job.created_at}")
+print(f"Number of items: {len(job.job_items)}")
+
+# Access individual crawled items
+for item in job.job_items:
+    print(f"Page title: {item.title}")
+    print(f"Original URL: {item.original_url}")
+    print(f"Content URL: {item.markdown_content_url}")
+
+# Cancel a running job if needed
+cancel_response = crawler.cancel_job(job_id)
+print(f"Cancellation response: {cancel_response['message']}")
+```
+
+## API Methods
+
+### crawl()
+Starts a new crawling job and waits for its completion. This method will continuously poll the job status until:
+- The job reaches a terminal state (done, error, or cancelled)
+- The maximum number of polls is reached (default: 100)
+- The polling interval is determined by the server's `recommended_pull_delay_ms` or defaults to 5 seconds
+
+### crawl_async()
+Starts a new crawling job and returns immediately with a job ID. Use this when you want to handle polling and status checks yourself, or when using webhooks.
+
+### get_job()
+Retrieves the current status and details of a specific job.
+
+### cancel_job()
+Cancels a running job. Any items that are not in progress or already completed will be marked as canceled and will not be charged.
+
+## Parameters
+
+### Crawl Methods (crawl and crawl_async)
+- `url` (required): The seed URL where the crawler starts. Can be any valid URL.
+- `scrape_type` (default: "html"): The type of scraping you want to perform. Can be "html", "cleaned", or "markdown".
+- `items_limit` (default: 10): Crawler will stop when it reaches this limit of pages for this job.
+- `webhook_url` (optional): The URL where the server will send a POST request once the task is completed.
+- `allow_subdomains` (default: False): If True, the crawler will also crawl subdomains.
+- `whitelist_regexp` (optional): A regular expression to whitelist URLs. Only URLs that match the pattern will be crawled.
+- `blacklist_regexp` (optional): A regular expression to blacklist URLs. URLs that match the pattern will be skipped.
+- `max_polls` (optional, crawl only): Maximum number of status checks before returning (default: 100)
+
+### Responses
+
+#### CrawlAsync Response
+The `crawl_async()` method returns a `CrawlResponse` object with:
+- `id`: The unique identifier of the created job
+
+#### Job Response
+The Job object contains detailed information about the crawling job:
+
+- `id`: The unique identifier of the job
+- `org_id`: Your organization identifier
+- `url`: The seed URL where the crawler started
+- `status`: The status of the job (new, in_progress, done, error)
+- `scrape_type`: The type of scraping performed
+- `created_at`: The date when the job was created
+- `finished_at`: The date when the job was finished (if completed)
+- `webhook_url`: The webhook URL for notifications
+- `webhook_status`: The status of the webhook request
+- `webhook_error`: Any error message if the webhook request failed
+- `job_items`: List of JobItem objects representing crawled pages
+- `recommended_pull_delay_ms`: Server-recommended delay between status checks
+
+### JobItem Properties
+
+Each JobItem object represents a crawled page and contains:
+
+- `id`: The unique identifier of the item
+- `job_id`: The parent job identifier
+- `original_url`: The URL of the page
+- `page_status_code`: The HTTP status code of the page request
+- `status`: The status of the item (new, in_progress, done, error)
+- `title`: The page title
+- `created_at`: The date when the item was created
+- `cost`: The cost of the item in $
+- `referred_url`: The URL where the page was referred from
+- `last_error`: Any error message if the item failed
+- `raw_content_url`: URL to the raw content (if available)
+- `cleaned_content_url`: URL to the cleaned content (if scrape_type is "cleaned")
+- `markdown_content_url`: URL to the markdown content (if scrape_type is "markdown")
+
+## Requirements
+
+- Python 3.6+
+- requests>=2.25.0
+
+## License
+
+MIT License
webcrawlerapi-1.0.1/webcrawlerapi.egg-info/SOURCES.txt
@@ -0,0 +1,9 @@
+README.md
+setup.py
+webcrawlerapi/__init__.py
+webcrawlerapi/client.py
+webcrawlerapi.egg-info/PKG-INFO
+webcrawlerapi.egg-info/SOURCES.txt
+webcrawlerapi.egg-info/dependency_links.txt
+webcrawlerapi.egg-info/requires.txt
+webcrawlerapi.egg-info/top_level.txt
webcrawlerapi-1.0.1/webcrawlerapi.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
webcrawlerapi-1.0.1/webcrawlerapi.egg-info/requires.txt
@@ -0,0 +1 @@
+requests>=2.25.0
webcrawlerapi-1.0.1/webcrawlerapi.egg-info/top_level.txt
@@ -0,0 +1 @@
+webcrawlerapi