webcrawlerapi 2.0.7__tar.gz → 2.0.8__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/PKG-INFO +1 -1
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/setup.py +1 -1
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi/client.py +7 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi/models.py +2 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi.egg-info/PKG-INFO +1 -1
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/README.md +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/pyproject.toml +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/setup.cfg +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/tests/__init__.py +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/tests/test_client.py +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/tests/test_models.py +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi/__init__.py +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi.egg-info/SOURCES.txt +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi.egg-info/dependency_links.txt +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi.egg-info/requires.txt +0 -0
- {webcrawlerapi-2.0.7 → webcrawlerapi-2.0.8}/webcrawlerapi.egg-info/top_level.txt +0 -0
|
@@ -50,6 +50,7 @@ class WebCrawlerAPI:
|
|
|
50
50
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
51
51
|
respect_robots_txt: bool = False,
|
|
52
52
|
main_content_only: bool = False,
|
|
53
|
+
max_depth: Optional[int] = None,
|
|
53
54
|
) -> CrawlResponse:
|
|
54
55
|
"""
|
|
55
56
|
Start a new crawling job asynchronously.
|
|
@@ -65,6 +66,7 @@ class WebCrawlerAPI:
|
|
|
65
66
|
actions (Action or List[Action], optional): Actions to perform during crawling
|
|
66
67
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
67
68
|
main_content_only (bool): Whether to extract only main content (default: False)
|
|
69
|
+
max_depth (int, optional): Maximum depth of crawl (0 for seed URL only, 1 for one level deep, etc.)
|
|
68
70
|
|
|
69
71
|
Returns:
|
|
70
72
|
CrawlResponse: Response containing the job ID
|
|
@@ -87,6 +89,8 @@ class WebCrawlerAPI:
|
|
|
87
89
|
payload["whitelist_regexp"] = whitelist_regexp
|
|
88
90
|
if blacklist_regexp:
|
|
89
91
|
payload["blacklist_regexp"] = blacklist_regexp
|
|
92
|
+
if max_depth is not None:
|
|
93
|
+
payload["max_depth"] = max_depth
|
|
90
94
|
if actions:
|
|
91
95
|
# Convert single action to list if needed
|
|
92
96
|
action_list = [actions] if not isinstance(actions, list) else actions
|
|
@@ -150,6 +154,7 @@ class WebCrawlerAPI:
|
|
|
150
154
|
actions: Optional[Union[Action, List[Action]]] = None,
|
|
151
155
|
respect_robots_txt: bool = False,
|
|
152
156
|
main_content_only: bool = False,
|
|
157
|
+
max_depth: Optional[int] = None,
|
|
153
158
|
max_polls: int = 100,
|
|
154
159
|
) -> Job:
|
|
155
160
|
"""
|
|
@@ -170,6 +175,7 @@ class WebCrawlerAPI:
|
|
|
170
175
|
actions (Action or List[Action], optional): Actions to perform during crawling
|
|
171
176
|
respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
|
|
172
177
|
main_content_only (bool): Whether to extract only main content (default: False)
|
|
178
|
+
max_depth (int, optional): Maximum depth of crawl (0 for seed URL only, 1 for one level deep, etc.)
|
|
173
179
|
max_polls (int): Maximum number of status checks before returning (default: 100)
|
|
174
180
|
|
|
175
181
|
Returns:
|
|
@@ -190,6 +196,7 @@ class WebCrawlerAPI:
|
|
|
190
196
|
actions=actions,
|
|
191
197
|
respect_robots_txt=respect_robots_txt,
|
|
192
198
|
main_content_only=main_content_only,
|
|
199
|
+
max_depth=max_depth,
|
|
193
200
|
)
|
|
194
201
|
|
|
195
202
|
job_id = response.id
|
|
@@ -125,6 +125,7 @@ class JobItem:
|
|
|
125
125
|
self.referred_url: Optional[str] = data.get("referred_url")
|
|
126
126
|
self.last_error: Optional[str] = data.get("last_error")
|
|
127
127
|
self.error_code: Optional[str] = data.get("error_code")
|
|
128
|
+
self.depth: Optional[int] = data.get("depth")
|
|
128
129
|
|
|
129
130
|
# Optional content URLs based on scrape_type
|
|
130
131
|
self.raw_content_url: Optional[str] = data.get("raw_content_url")
|
|
@@ -201,6 +202,7 @@ class Job:
|
|
|
201
202
|
self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
|
|
202
203
|
self.allow_subdomains: bool = data.get("allow_subdomains", False)
|
|
203
204
|
self.items_limit: int = data["items_limit"]
|
|
205
|
+
self.max_depth: Optional[int] = data.get("max_depth")
|
|
204
206
|
self.created_at: datetime = parse_datetime(data["created_at"])
|
|
205
207
|
self.updated_at: datetime = parse_datetime(data["updated_at"])
|
|
206
208
|
self.webhook_url: Optional[str] = data.get("webhook_url")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|