webcrawlerapi 2.0.7__tar.gz → 2.0.8__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.7
3
+ Version: 2.0.8
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
2
2
 
3
3
  setup(
4
4
  name="webcrawlerapi",
5
- version="2.0.7",
5
+ version="2.0.8",
6
6
  packages=find_packages(),
7
7
  install_requires=[
8
8
  "requests>=2.25.0",
@@ -50,6 +50,7 @@ class WebCrawlerAPI:
50
50
  actions: Optional[Union[Action, List[Action]]] = None,
51
51
  respect_robots_txt: bool = False,
52
52
  main_content_only: bool = False,
53
+ max_depth: Optional[int] = None,
53
54
  ) -> CrawlResponse:
54
55
  """
55
56
  Start a new crawling job asynchronously.
@@ -65,6 +66,7 @@ class WebCrawlerAPI:
65
66
  actions (Action or List[Action], optional): Actions to perform during crawling
66
67
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
67
68
  main_content_only (bool): Whether to extract only main content (default: False)
69
+ max_depth (int, optional): Maximum depth of crawl (0 for seed URL only, 1 for one level deep, etc.)
68
70
 
69
71
  Returns:
70
72
  CrawlResponse: Response containing the job ID
@@ -87,6 +89,8 @@ class WebCrawlerAPI:
87
89
  payload["whitelist_regexp"] = whitelist_regexp
88
90
  if blacklist_regexp:
89
91
  payload["blacklist_regexp"] = blacklist_regexp
92
+ if max_depth is not None:
93
+ payload["max_depth"] = max_depth
90
94
  if actions:
91
95
  # Convert single action to list if needed
92
96
  action_list = [actions] if not isinstance(actions, list) else actions
@@ -150,6 +154,7 @@ class WebCrawlerAPI:
150
154
  actions: Optional[Union[Action, List[Action]]] = None,
151
155
  respect_robots_txt: bool = False,
152
156
  main_content_only: bool = False,
157
+ max_depth: Optional[int] = None,
153
158
  max_polls: int = 100,
154
159
  ) -> Job:
155
160
  """
@@ -170,6 +175,7 @@ class WebCrawlerAPI:
170
175
  actions (Action or List[Action], optional): Actions to perform during crawling
171
176
  respect_robots_txt (bool): Whether to respect robots.txt file (default: False)
172
177
  main_content_only (bool): Whether to extract only main content (default: False)
178
+ max_depth (int, optional): Maximum depth of crawl (0 for seed URL only, 1 for one level deep, etc.)
173
179
  max_polls (int): Maximum number of status checks before returning (default: 100)
174
180
 
175
181
  Returns:
@@ -190,6 +196,7 @@ class WebCrawlerAPI:
190
196
  actions=actions,
191
197
  respect_robots_txt=respect_robots_txt,
192
198
  main_content_only=main_content_only,
199
+ max_depth=max_depth,
193
200
  )
194
201
 
195
202
  job_id = response.id
@@ -125,6 +125,7 @@ class JobItem:
125
125
  self.referred_url: Optional[str] = data.get("referred_url")
126
126
  self.last_error: Optional[str] = data.get("last_error")
127
127
  self.error_code: Optional[str] = data.get("error_code")
128
+ self.depth: Optional[int] = data.get("depth")
128
129
 
129
130
  # Optional content URLs based on scrape_type
130
131
  self.raw_content_url: Optional[str] = data.get("raw_content_url")
@@ -201,6 +202,7 @@ class Job:
201
202
  self.blacklist_regexp: Optional[str] = data.get("blacklist_regexp")
202
203
  self.allow_subdomains: bool = data.get("allow_subdomains", False)
203
204
  self.items_limit: int = data["items_limit"]
205
+ self.max_depth: Optional[int] = data.get("max_depth")
204
206
  self.created_at: datetime = parse_datetime(data["created_at"])
205
207
  self.updated_at: datetime = parse_datetime(data["updated_at"])
206
208
  self.webhook_url: Optional[str] = data.get("webhook_url")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: webcrawlerapi
3
- Version: 2.0.7
3
+ Version: 2.0.8
4
4
  Summary: Python SDK for WebCrawler API
5
5
  Home-page: https://github.com/webcrawlerapi/webcrawlerapi-python-sdk
6
6
  Author: Andrew
File without changes
File without changes