hyperbrowser 0.23.0.tar.gz → 0.24.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (40)
  1. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/PKG-INFO +1 -1
  2. hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/crawl.py +101 -0
  3. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/extract.py +19 -4
  4. hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/scrape.py +150 -0
  5. hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/crawl.py +102 -0
  6. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/extract.py +18 -4
  7. hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/scrape.py +148 -0
  8. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/consts.py +2 -0
  9. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/crawl.py +2 -2
  10. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/extract.py +5 -1
  11. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/scrape.py +76 -1
  12. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/pyproject.toml +1 -1
  13. hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/crawl.py +0 -60
  14. hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/scrape.py +0 -36
  15. hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/crawl.py +0 -60
  16. hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/scrape.py +0 -36
  17. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/LICENSE +0 -0
  18. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/README.md +0 -0
  19. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/__init__.py +0 -0
  20. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/async_client.py +0 -0
  21. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/base.py +0 -0
  22. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/extension.py +0 -0
  23. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/profile.py +0 -0
  24. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/session.py +0 -0
  25. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/extension.py +0 -0
  26. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/profile.py +0 -0
  27. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/session.py +0 -0
  28. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/sync.py +0 -0
  29. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/config.py +0 -0
  30. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/exceptions.py +0 -0
  31. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/extension.py +0 -0
  32. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/profile.py +0 -0
  33. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/session.py +0 -0
  34. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/__init__.py +0 -0
  35. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/anthropic.py +0 -0
  36. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/openai.py +0 -0
  37. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/tools/schema.py +0 -0
  38. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/transport/async_transport.py +0 -0
  39. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/transport/base.py +0 -0
  40. {hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/transport/sync.py +0 -0
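Summary of the change set: 0.24.0 replaces the bare polling loops in the crawl, scrape, and extract managers with retry-bounded loops (up to POLLING_ATTEMPTS consecutive failures) and adds a batch scrape API (BatchScrapeManager plus the StartBatchScrapeJobParams / BatchScrapeJobResponse models). As a rough orientation before the diff itself, here is a minimal usage sketch of the new batch scrape flow; the Hyperbrowser entry point, the api_key argument, and the client.scrape.batch attribute path are assumptions inferred from the manager wiring shown below, not something this diff confirms.

    from hyperbrowser import Hyperbrowser  # assumed entry point, not shown in this diff
    from hyperbrowser.models.scrape import StartBatchScrapeJobParams

    client = Hyperbrowser(api_key="YOUR_API_KEY")  # assumed constructor signature

    # start_and_wait submits the URLs, polls /scrape/batch/{job_id} until the job
    # settles, then pages through the results in batches of 100 and returns them merged.
    job = client.scrape.batch.start_and_wait(
        StartBatchScrapeJobParams(urls=["https://example.com", "https://example.org"])
    )
    print(job.status, job.total_scraped_pages)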
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: hyperbrowser
-Version: 0.23.0
+Version: 0.24.0
 Summary: Python SDK for hyperbrowser
 License: MIT
 Author: Nikhil Shahi
hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/crawl.py (new file)
@@ -0,0 +1,101 @@
+import asyncio
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.crawl import (
+    CrawlJobResponse,
+    GetCrawlJobParams,
+    StartCrawlJobParams,
+    StartCrawlJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class CrawlManager:
+    def __init__(self, client):
+        self._client = client
+
+    async def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/crawl"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartCrawlJobResponse(**response.data)
+
+    async def get(
+        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
+    ) -> CrawlJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
+        )
+        return CrawlJobResponse(**response.data)
+
+    async def start_and_wait(
+        self, params: StartCrawlJobParams, return_all_pages: bool = True
+    ) -> CrawlJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start crawl job")
+
+        job_response: CrawlJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(
+                    job_id,
+                    params=GetCrawlJobParams(batch_size=1),
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = await self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                await asyncio.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = await self.get(
+                    job_start_resp.job_id,
+                    GetCrawlJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get crawl batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(0.5)
+
+        return job_response
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/async_manager/extract.py
@@ -1,5 +1,6 @@
 import asyncio
 from hyperbrowser.exceptions import HyperbrowserError
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
     StartExtractJobParams,
@@ -32,10 +33,24 @@ class ExtractManager:
 
     async def start_and_wait(self, params: StartExtractJobParams) -> ExtractJobResponse:
         job_start_resp = await self.start(params)
-        if not job_start_resp.job_id:
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start extract job")
+
+        failures = 0
         while True:
-            job_response = await self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
+            try:
+                job_response = await self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll extract job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             await asyncio.sleep(2)
hyperbrowser-0.24.0/hyperbrowser/client/managers/async_manager/scrape.py (new file)
@@ -0,0 +1,150 @@
+import asyncio
+from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.scrape import (
+    BatchScrapeJobResponse,
+    GetBatchScrapeJobParams,
+    ScrapeJobResponse,
+    StartBatchScrapeJobParams,
+    StartBatchScrapeJobResponse,
+    StartScrapeJobParams,
+    StartScrapeJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class BatchScrapeManager:
+    def __init__(self, client):
+        self._client = client
+
+    async def start(
+        self, params: StartBatchScrapeJobParams
+    ) -> StartBatchScrapeJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/scrape/batch"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartBatchScrapeJobResponse(**response.data)
+
+    async def get(
+        self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
+    ) -> BatchScrapeJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}"), params=params.__dict__
+        )
+        return BatchScrapeJobResponse(**response.data)
+
+    async def start_and_wait(
+        self, params: StartBatchScrapeJobParams, return_all_pages: bool = True
+    ) -> BatchScrapeJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start batch scrape job")
+
+        job_response: BatchScrapeJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(
+                    job_id, params=GetBatchScrapeJobParams(batch_size=1)
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = await self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                await asyncio.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = await self.get(
+                    job_id,
+                    params=GetBatchScrapeJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_scraped_pages = tmp_job_response.total_scraped_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(0.5)
+
+        return job_response
+
+
+class ScrapeManager:
+    def __init__(self, client):
+        self._client = client
+        self.batch = BatchScrapeManager(client)
+
+    async def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/scrape"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartScrapeJobResponse(**response.data)
+
+    async def get(self, job_id: str) -> ScrapeJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/{job_id}")
+        )
+        return ScrapeJobResponse(**response.data)
+
+    async def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start scrape job")
+
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/crawl.py (new file)
@@ -0,0 +1,102 @@
+import time
+from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.crawl import (
+    CrawlJobResponse,
+    GetCrawlJobParams,
+    StartCrawlJobParams,
+    StartCrawlJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class CrawlManager:
+    def __init__(self, client):
+        self._client = client
+
+    def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/crawl"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartCrawlJobResponse(**response.data)
+
+    def get(
+        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
+    ) -> CrawlJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
+        )
+        return CrawlJobResponse(**response.data)
+
+    def start_and_wait(
+        self, params: StartCrawlJobParams, return_all_pages: bool = True
+    ) -> CrawlJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start crawl job")
+
+        job_response: CrawlJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(
+                    job_id,
+                    params=GetCrawlJobParams(batch_size=1),
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                time.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = self.get(
+                    job_id,
+                    GetCrawlJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get crawl batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(0.5)
+
+        return job_response
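For reference, a hedged sketch of how the synchronous crawl flow above might be driven; the Hyperbrowser entry point and the client.crawl attribute path are assumptions, since only CrawlManager and StartCrawlJobParams appear in this diff.

    from hyperbrowser import Hyperbrowser  # assumed entry point
    from hyperbrowser.models.crawl import StartCrawlJobParams

    client = Hyperbrowser(api_key="YOUR_API_KEY")  # assumed constructor signature

    # Polls /crawl/{job_id} with batch_size=1 until the job completes or fails,
    # then re-fetches the crawled pages in batches of 100 and merges them into .data.
    job = client.crawl.start_and_wait(
        StartCrawlJobParams(url="https://example.com", max_pages=20),
        return_all_pages=True,
    )
    for page in job.data or []:
        print(page)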
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/client/managers/sync_manager/extract.py
@@ -1,5 +1,6 @@
 import time
 from hyperbrowser.exceptions import HyperbrowserError
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
     StartExtractJobParams,
@@ -32,10 +33,23 @@ class ExtractManager:
 
     def start_and_wait(self, params: StartExtractJobParams) -> ExtractJobResponse:
         job_start_resp = self.start(params)
-        if not job_start_resp.job_id:
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start extract job")
+
+        failures = 0
         while True:
-            job_response = self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
+            try:
+                job_response = self.get(job_start_resp.job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll extract job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             time.sleep(2)
hyperbrowser-0.24.0/hyperbrowser/client/managers/sync_manager/scrape.py (new file)
@@ -0,0 +1,148 @@
+import time
+from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
+from ....models.scrape import (
+    BatchScrapeJobResponse,
+    GetBatchScrapeJobParams,
+    ScrapeJobResponse,
+    StartBatchScrapeJobParams,
+    StartBatchScrapeJobResponse,
+    StartScrapeJobParams,
+    StartScrapeJobResponse,
+)
+from ....exceptions import HyperbrowserError
+
+
+class BatchScrapeManager:
+    def __init__(self, client):
+        self._client = client
+
+    def start(self, params: StartBatchScrapeJobParams) -> StartBatchScrapeJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/scrape/batch"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartBatchScrapeJobResponse(**response.data)
+
+    def get(
+        self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
+    ) -> BatchScrapeJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}"), params=params.__dict__
+        )
+        return BatchScrapeJobResponse(**response.data)
+
+    def start_and_wait(
+        self, params: StartBatchScrapeJobParams, return_all_pages: bool = True
+    ) -> BatchScrapeJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start batch scrape job")
+
+        job_response: BatchScrapeJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(
+                    job_id, params=GetBatchScrapeJobParams(batch_size=1)
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                time.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = self.get(
+                    job_start_resp.job_id,
+                    GetBatchScrapeJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_scraped_pages = tmp_job_response.total_scraped_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(0.5)
+
+        return job_response
+
+
+class ScrapeManager:
+    def __init__(self, client):
+        self._client = client
+        self.batch = BatchScrapeManager(client)
+
+    def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/scrape"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartScrapeJobResponse(**response.data)
+
+    def get(self, job_id: str) -> ScrapeJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/{job_id}")
+        )
+        return ScrapeJobResponse(**response.data)
+
+    def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start scrape job")
+
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/consts.py
@@ -2,6 +2,8 @@ from typing import Literal
 
 ScrapeFormat = Literal["markdown", "html", "links", "screenshot"]
 ScrapeWaitUntil = Literal["load", "domcontentloaded", "networkidle"]
+ScrapePageStatus = Literal["completed", "failed", "pending", "running"]
+POLLING_ATTEMPTS = 5
 
 Country = Literal[
     "AD",
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/crawl.py
@@ -18,7 +18,7 @@ class StartCrawlJobParams(BaseModel):
     )
 
     url: str
-    max_pages: int = Field(default=10, ge=1, serialization_alias="maxPages")
+    max_pages: Optional[int] = Field(default=None, ge=1, serialization_alias="maxPages")
     follow_links: bool = Field(default=True, serialization_alias="followLinks")
     ignore_sitemap: bool = Field(default=False, serialization_alias="ignoreSitemap")
     exclude_patterns: List[str] = Field(
@@ -69,7 +69,7 @@ class GetCrawlJobParams(BaseModel):
 
     page: Optional[int] = Field(default=None, serialization_alias="page")
     batch_size: Optional[int] = Field(
-        default=20, ge=1, le=30, serialization_alias="batchSize"
+        default=None, ge=1, serialization_alias="batchSize"
     )
 
 
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/extract.py
@@ -17,13 +17,17 @@ class StartExtractJobParams(BaseModel):
     )
 
     urls: List[str]
-    prompt: Optional[str] = None
+    system_prompt: Optional[str] = Field(
+        default=None, serialization_alias="systemPrompt"
+    )
+    prompt: Optional[str] = Field(default=None, serialization_alias="prompt")
     schema_: Optional[Any] = pydantic.Field(
         None, alias="schema", serialization_alias="schema"
     )
     session_options: Optional[CreateSessionParams] = Field(
         default=None, serialization_alias="sessionOptions"
     )
+    max_links: Optional[int] = Field(default=None, serialization_alias="maxLinks")
 
 
 class StartExtractJobResponse(BaseModel):
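The extract params gain a system prompt and a link cap in this release. A minimal sketch of building the new parameters follows; the field names come from the diff above, while passing a Pydantic JSON schema for the schema field and the meaning of max_links (presumably a cap on the links followed) are assumptions about typical usage, not something this diff shows.

    from pydantic import BaseModel
    from hyperbrowser.models.extract import StartExtractJobParams

    class ProductInfo(BaseModel):
        name: str
        price: float

    params = StartExtractJobParams(
        urls=["https://example.com/catalog"],
        system_prompt="You extract structured product data.",  # new in 0.24.0
        prompt="List every product with its price.",
        schema=ProductInfo.model_json_schema(),  # field is schema_ with alias "schema"
        max_links=25,  # new in 0.24.0; presumably caps the number of links followed
    )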
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/hyperbrowser/models/scrape.py
@@ -1,7 +1,7 @@
 from typing import List, Literal, Optional, Union
 from pydantic import BaseModel, ConfigDict, Field
 
-from hyperbrowser.models.consts import ScrapeFormat, ScrapeWaitUntil
+from hyperbrowser.models.consts import ScrapeFormat, ScrapePageStatus, ScrapeWaitUntil
 from hyperbrowser.models.session import CreateSessionParams
 
 ScrapeJobStatus = Literal["pending", "running", "completed", "failed"]
@@ -84,3 +84,78 @@ class ScrapeJobResponse(BaseModel):
     status: ScrapeJobStatus
     error: Optional[str] = None
     data: Optional[ScrapeJobData] = None
+
+
+class StartBatchScrapeJobParams(BaseModel):
+    """
+    Parameters for creating a new batch scrape job.
+    """
+
+    urls: List[str]
+    session_options: Optional[CreateSessionParams] = Field(
+        default=None, serialization_alias="sessionOptions"
+    )
+    scrape_options: Optional[ScrapeOptions] = Field(
+        default=None, serialization_alias="scrapeOptions"
+    )
+
+
+class ScrapedPage(BaseModel):
+    """
+    A scraped page.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    url: str
+    status: ScrapePageStatus
+    error: Optional[str] = None
+    metadata: Optional[dict[str, Union[str, list[str]]]] = None
+    html: Optional[str] = None
+    markdown: Optional[str] = None
+    links: Optional[List[str]] = None
+    screenshot: Optional[str] = None
+
+
+class GetBatchScrapeJobParams(BaseModel):
+    """
+    Parameters for getting a batch scrape job.
+    """
+
+    page: Optional[int] = Field(default=None, serialization_alias="page")
+    batch_size: Optional[int] = Field(
+        default=None, ge=1, serialization_alias="batchSize"
+    )
+
+
+class StartBatchScrapeJobResponse(BaseModel):
+    """
+    Response from starting a batch scrape job.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    job_id: str = Field(alias="jobId")
+
+
+class BatchScrapeJobResponse(BaseModel):
+    """
+    Response from getting a batch scrape job.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    job_id: str = Field(alias="jobId")
+    status: ScrapeJobStatus
+    error: Optional[str] = None
+    data: Optional[List[ScrapedPage]] = Field(alias="data")
+    total_scraped_pages: int = Field(alias="totalScrapedPages")
+    total_page_batches: int = Field(alias="totalPageBatches")
+    current_page_batch: int = Field(alias="currentPageBatch")
+    batch_size: int = Field(alias="batchSize")
{hyperbrowser-0.23.0 → hyperbrowser-0.24.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "hyperbrowser"
-version = "0.23.0"
+version = "0.24.0"
 description = "Python SDK for hyperbrowser"
 authors = ["Nikhil Shahi <nshahi1998@gmail.com>"]
 license = "MIT"
hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/crawl.py (deleted)
@@ -1,60 +0,0 @@
-import asyncio
-from typing import Optional
-from ....models.crawl import (
-    CrawlJobResponse,
-    GetCrawlJobParams,
-    StartCrawlJobParams,
-    StartCrawlJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class CrawlManager:
-    def __init__(self, client):
-        self._client = client
-
-    async def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
-        response = await self._client.transport.post(
-            self._client._build_url("/crawl"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartCrawlJobResponse(**response.data)
-
-    async def get(
-        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
-    ) -> CrawlJobResponse:
-        response = await self._client.transport.get(
-            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
-        )
-        return CrawlJobResponse(**response.data)
-
-    async def start_and_wait(
-        self, params: StartCrawlJobParams, return_all_pages: bool = True
-    ) -> CrawlJobResponse:
-        job_start_resp = await self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start crawl job")
-
-        job_response: CrawlJobResponse
-        while True:
-            job_response = await self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                break
-            await asyncio.sleep(2)
-
-        if not return_all_pages:
-            return job_response
-
-        while job_response.current_page_batch < job_response.total_page_batches:
-            tmp_job_response = await self.get(
-                job_start_resp.job_id,
-                GetCrawlJobParams(page=job_response.current_page_batch + 1),
-            )
-            if tmp_job_response.data:
-                job_response.data.extend(tmp_job_response.data)
-            job_response.current_page_batch = tmp_job_response.current_page_batch
-            job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
-            job_response.total_page_batches = tmp_job_response.total_page_batches
-            job_response.batch_size = tmp_job_response.batch_size
-            await asyncio.sleep(0.5)
-        return job_response
hyperbrowser-0.23.0/hyperbrowser/client/managers/async_manager/scrape.py (deleted)
@@ -1,36 +0,0 @@
-import asyncio
-from typing import Optional
-from ....models.scrape import (
-    ScrapeJobResponse,
-    StartScrapeJobParams,
-    StartScrapeJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class ScrapeManager:
-    def __init__(self, client):
-        self._client = client
-
-    async def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
-        response = await self._client.transport.post(
-            self._client._build_url("/scrape"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartScrapeJobResponse(**response.data)
-
-    async def get(self, job_id: str) -> ScrapeJobResponse:
-        response = await self._client.transport.get(
-            self._client._build_url(f"/scrape/{job_id}")
-        )
-        return ScrapeJobResponse(**response.data)
-
-    async def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
-        job_start_resp = await self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start scrape job")
-        while True:
-            job_response = await self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
-            await asyncio.sleep(2)
hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/crawl.py (deleted)
@@ -1,60 +0,0 @@
-import time
-from typing import Optional
-from ....models.crawl import (
-    CrawlJobResponse,
-    GetCrawlJobParams,
-    StartCrawlJobParams,
-    StartCrawlJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class CrawlManager:
-    def __init__(self, client):
-        self._client = client
-
-    def start(self, params: StartCrawlJobParams) -> StartCrawlJobResponse:
-        response = self._client.transport.post(
-            self._client._build_url("/crawl"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartCrawlJobResponse(**response.data)
-
-    def get(
-        self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
-    ) -> CrawlJobResponse:
-        response = self._client.transport.get(
-            self._client._build_url(f"/crawl/{job_id}"), params=params.__dict__
-        )
-        return CrawlJobResponse(**response.data)
-
-    def start_and_wait(
-        self, params: StartCrawlJobParams, return_all_pages: bool = True
-    ) -> CrawlJobResponse:
-        job_start_resp = self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start crawl job")
-
-        job_response: CrawlJobResponse
-        while True:
-            job_response = self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                break
-            time.sleep(2)
-
-        if not return_all_pages:
-            return job_response
-
-        while job_response.current_page_batch < job_response.total_page_batches:
-            tmp_job_response = self.get(
-                job_start_resp.job_id,
-                GetCrawlJobParams(page=job_response.current_page_batch + 1),
-            )
-            if tmp_job_response.data:
-                job_response.data.extend(tmp_job_response.data)
-            job_response.current_page_batch = tmp_job_response.current_page_batch
-            job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
-            job_response.total_page_batches = tmp_job_response.total_page_batches
-            job_response.batch_size = tmp_job_response.batch_size
-            time.sleep(0.5)
-        return job_response
hyperbrowser-0.23.0/hyperbrowser/client/managers/sync_manager/scrape.py (deleted)
@@ -1,36 +0,0 @@
-import time
-from typing import Optional
-from ....models.scrape import (
-    ScrapeJobResponse,
-    StartScrapeJobParams,
-    StartScrapeJobResponse,
-)
-from ....exceptions import HyperbrowserError
-
-
-class ScrapeManager:
-    def __init__(self, client):
-        self._client = client
-
-    def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
-        response = self._client.transport.post(
-            self._client._build_url("/scrape"),
-            data=params.model_dump(exclude_none=True, by_alias=True),
-        )
-        return StartScrapeJobResponse(**response.data)
-
-    def get(self, job_id: str) -> ScrapeJobResponse:
-        response = self._client.transport.get(
-            self._client._build_url(f"/scrape/{job_id}")
-        )
-        return ScrapeJobResponse(**response.data)
-
-    def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
-        job_start_resp = self.start(params)
-        if not job_start_resp.job_id:
-            raise HyperbrowserError("Failed to start scrape job")
-        while True:
-            job_response = self.get(job_start_resp.job_id)
-            if job_response.status == "completed" or job_response.status == "failed":
-                return job_response
-            time.sleep(2)