hyperbrowser 0.32.0__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hyperbrowser might be problematic.
- hyperbrowser/client/async_client.py +8 -8
- hyperbrowser/client/managers/async_manager/{beta/agents → agents}/__init__.py +3 -2
- hyperbrowser/client/managers/async_manager/{beta/agents → agents}/browser_use.py +5 -3
- hyperbrowser/client/managers/async_manager/crawl.py +30 -15
- hyperbrowser/client/managers/async_manager/extract.py +15 -7
- hyperbrowser/client/managers/async_manager/profile.py +2 -1
- hyperbrowser/client/managers/async_manager/scrape.py +42 -21
- hyperbrowser/client/managers/async_manager/session.py +2 -1
- hyperbrowser/client/managers/sync_manager/{beta/agents → agents}/__init__.py +3 -2
- hyperbrowser/client/managers/sync_manager/{beta/agents → agents}/browser_use.py +5 -3
- hyperbrowser/client/managers/sync_manager/crawl.py +31 -16
- hyperbrowser/client/managers/sync_manager/extract.py +15 -7
- hyperbrowser/client/managers/sync_manager/profile.py +2 -1
- hyperbrowser/client/managers/sync_manager/scrape.py +44 -23
- hyperbrowser/client/managers/sync_manager/session.py +2 -1
- hyperbrowser/client/sync.py +8 -8
- hyperbrowser/models/__init__.py +76 -67
- hyperbrowser/models/{beta/agents → agents}/browser_use.py +4 -2
- hyperbrowser/models/crawl.py +12 -0
- hyperbrowser/models/extract.py +12 -0
- hyperbrowser/models/scrape.py +24 -0
- hyperbrowser/tools/__init__.py +47 -0
- hyperbrowser/tools/anthropic.py +18 -1
- hyperbrowser/tools/openai.py +26 -1
- hyperbrowser/tools/schema.py +74 -0
- {hyperbrowser-0.32.0.dist-info → hyperbrowser-0.34.0.dist-info}/METADATA +2 -1
- hyperbrowser-0.34.0.dist-info/RECORD +42 -0
- hyperbrowser/client/managers/async_manager/beta/__init__.py +0 -6
- hyperbrowser/client/managers/sync_manager/beta/__init__.py +0 -6
- hyperbrowser-0.32.0.dist-info/RECORD +0 -44
- {hyperbrowser-0.32.0.dist-info → hyperbrowser-0.34.0.dist-info}/LICENSE +0 -0
- {hyperbrowser-0.32.0.dist-info → hyperbrowser-0.34.0.dist-info}/WHEEL +0 -0
hyperbrowser/client/async_client.py

@@ -1,15 +1,15 @@
 from typing import Optional
 
+from ..config import ClientConfig
+from ..transport.async_transport import AsyncTransport
+from .base import HyperbrowserBase
+from .managers.async_manager.agents import Agents
+from .managers.async_manager.crawl import CrawlManager
+from .managers.async_manager.extension import ExtensionManager
 from .managers.async_manager.extract import ExtractManager
 from .managers.async_manager.profile import ProfileManager
-from .managers.async_manager.session import SessionManager
 from .managers.async_manager.scrape import ScrapeManager
-from .managers.async_manager.
-from .managers.async_manager.extension import ExtensionManager
-from .managers.async_manager.beta import Beta
-from .base import HyperbrowserBase
-from ..transport.async_transport import AsyncTransport
-from ..config import ClientConfig
+from .managers.async_manager.session import SessionManager
 
 
 class AsyncHyperbrowser(HyperbrowserBase):
@@ -30,7 +30,7 @@ class AsyncHyperbrowser(HyperbrowserBase):
         self.extract = ExtractManager(self)
         self.profiles = ProfileManager(self)
         self.extensions = ExtensionManager(self)
-        self.
+        self.agents = Agents(self)
 
     async def close(self) -> None:
         await self.transport.close()
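Net effect of this file: the browser-use agent manager leaves the `beta` namespace and hangs directly off the client as `client.agents`. A minimal usage sketch of the 0.34.0 surface — the `start_and_wait` call and the task parameter are carried over from the published SDK docs, not confirmed by this diff:

```python
import asyncio

from hyperbrowser import AsyncHyperbrowser
from hyperbrowser.models import StartBrowserUseTaskParams


async def main() -> None:
    client = AsyncHyperbrowser(api_key="hb_...")  # placeholder key
    try:
        # 0.32.0 call sites went through the beta namespace:
        #     await client.beta.agents.browser_use.start_and_wait(...)
        # 0.34.0 promotes the manager to a top-level attribute:
        resp = await client.agents.browser_use.start_and_wait(
            StartBrowserUseTaskParams(task="summarize example.com")
        )
        print(resp.status)
    finally:
        await client.close()


asyncio.run(main())
```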
hyperbrowser/client/managers/async_manager/{beta/agents → agents}/browser_use.py

@@ -1,12 +1,14 @@
 import asyncio
+
 from hyperbrowser.exceptions import HyperbrowserError
-
+
+from .....models import (
     POLLING_ATTEMPTS,
     BasicResponse,
+    BrowserUseTaskResponse,
+    BrowserUseTaskStatusResponse,
     StartBrowserUseTaskParams,
     StartBrowserUseTaskResponse,
-    BrowserUseTaskStatusResponse,
-    BrowserUseTaskResponse,
 )
 
 
hyperbrowser/client/managers/async_manager/crawl.py

@@ -3,6 +3,8 @@ import asyncio
 from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.crawl import (
     CrawlJobResponse,
+    CrawlJobStatus,
+    CrawlJobStatusResponse,
     GetCrawlJobParams,
     StartCrawlJobParams,
     StartCrawlJobResponse,
@@ -21,11 +23,18 @@ class CrawlManager:
         )
         return StartCrawlJobResponse(**response.data)
 
+    async def get_status(self, job_id: str) -> CrawlJobStatusResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/crawl/{job_id}/status")
+        )
+        return CrawlJobStatusResponse(**response.data)
+
     async def get(
         self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
     ) -> CrawlJobResponse:
         response = await self._client.transport.get(
-            self._client._build_url(f"/crawl/{job_id}"),
+            self._client._build_url(f"/crawl/{job_id}"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
         )
         return CrawlJobResponse(**response.data)
 
@@ -37,18 +46,13 @@ class CrawlManager:
         if not job_id:
             raise HyperbrowserError("Failed to start crawl job")
 
-
+        job_status: CrawlJobStatus = "pending"
         failures = 0
         while True:
             try:
-
-
-
-                )
-                if (
-                    job_response.status == "completed"
-                    or job_response.status == "failed"
-                ):
+                job_status_resp = await self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
                     break
             except Exception as e:
                 failures += 1
@@ -62,8 +66,7 @@ class CrawlManager:
         if not return_all_pages:
             while True:
                 try:
-
-                    return job_response
+                    return await self.get(job_id)
                 except Exception as e:
                     failures += 1
                     if failures >= POLLING_ATTEMPTS:
@@ -73,9 +76,20 @@ class CrawlManager:
                 await asyncio.sleep(0.5)
 
         failures = 0
-        job_response
-
-
+        job_response = CrawlJobResponse(
+            jobId=job_id,
+            status=job_status,
+            data=[],
+            currentPageBatch=0,
+            totalPageBatches=0,
+            totalCrawledPages=0,
+            batchSize=100,
+        )
+        first_check = True
+        while (
+            first_check
+            or job_response.current_page_batch < job_response.total_page_batches
+        ):
             try:
                 tmp_job_response = await self.get(
                     job_start_resp.job_id,
@@ -90,6 +104,7 @@ class CrawlManager:
                 job_response.total_page_batches = tmp_job_response.total_page_batches
                 job_response.batch_size = tmp_job_response.batch_size
                 failures = 0
+                first_check = False
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
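The rewritten wait loop stops downloading the full crawl payload on every poll: it now watches the lightweight `GET /crawl/{job_id}/status` endpoint until the job settles, then fetches page data, batch by batch when all pages are requested. The pattern, condensed into a standalone helper (a sketch, not SDK code; assumes `client` is an `AsyncHyperbrowser`):

```python
import asyncio


async def wait_for_crawl(client, job_id: str):
    # Poll the cheap status endpoint instead of the full job payload.
    while True:
        status = (await client.crawl.get_status(job_id)).status
        if status in ("completed", "failed"):
            break
        await asyncio.sleep(0.5)
    # Fetch the (potentially large) page data only once the job has settled.
    return await client.crawl.get(job_id)
```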
hyperbrowser/client/managers/async_manager/extract.py

@@ -3,9 +3,11 @@ from hyperbrowser.exceptions import HyperbrowserError
 from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
+    ExtractJobStatusResponse,
     StartExtractJobParams,
     StartExtractJobResponse,
 )
+import jsonref
 
 
 class ExtractManager:
@@ -17,7 +19,9 @@ class ExtractManager:
             raise HyperbrowserError("Either schema or prompt must be provided")
         if params.schema_:
             if hasattr(params.schema_, "model_json_schema"):
-                params.schema_ =
+                params.schema_ = jsonref.replace_refs(
+                    params.schema_.model_json_schema(), proxies=False, lazy_load=False
+                )
 
         response = await self._client.transport.post(
             self._client._build_url("/extract"),
@@ -25,6 +29,12 @@ class ExtractManager:
         )
         return StartExtractJobResponse(**response.data)
 
+    async def get_status(self, job_id: str) -> ExtractJobStatusResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/extract/{job_id}/status")
+        )
+        return ExtractJobStatusResponse(**response.data)
+
     async def get(self, job_id: str) -> ExtractJobResponse:
         response = await self._client.transport.get(
             self._client._build_url(f"/extract/{job_id}")
@@ -40,12 +50,10 @@ class ExtractManager:
         failures = 0
         while True:
             try:
-
-
-
-
-                ):
-                    return job_response
+                job_status_resp = await self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
+                    return await self.get(job_id)
                 failures = 0
             except Exception as e:
                 failures += 1
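The new `jsonref` call is what lets callers hand a Pydantic model class to `start()` as the extraction schema: `model_json_schema()` renders nested models as `$ref` pointers into `$defs`, and `jsonref.replace_refs(..., proxies=False, lazy_load=False)` inlines those pointers so the API receives one self-contained schema dict. A small illustration with made-up models:

```python
import jsonref
from pydantic import BaseModel


class Author(BaseModel):
    name: str


class Article(BaseModel):
    title: str
    author: Author  # emitted as {"$ref": "#/$defs/Author"} by default


flat = jsonref.replace_refs(
    Article.model_json_schema(), proxies=False, lazy_load=False
)
# The author property is now the inlined Author schema, not a $ref pointer.
assert "$ref" not in str(flat["properties"]["author"])
```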
hyperbrowser/client/managers/async_manager/profile.py

@@ -33,6 +33,7 @@ class ProfileManager:
         self, params: ProfileListParams = ProfileListParams()
     ) -> ProfileListResponse:
         response = await self._client.transport.get(
-            self._client._build_url("/profiles"),
+            self._client._build_url("/profiles"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
         )
         return ProfileListResponse(**response.data)
hyperbrowser/client/managers/async_manager/scrape.py

@@ -4,8 +4,11 @@ from typing import Optional
 from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.scrape import (
     BatchScrapeJobResponse,
+    BatchScrapeJobStatusResponse,
     GetBatchScrapeJobParams,
     ScrapeJobResponse,
+    ScrapeJobStatus,
+    ScrapeJobStatusResponse,
     StartBatchScrapeJobParams,
     StartBatchScrapeJobResponse,
     StartScrapeJobParams,
@@ -27,11 +30,18 @@ class BatchScrapeManager:
         )
         return StartBatchScrapeJobResponse(**response.data)
 
+    async def get_status(self, job_id: str) -> BatchScrapeJobStatusResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}/status")
+        )
+        return BatchScrapeJobStatusResponse(**response.data)
+
     async def get(
         self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
     ) -> BatchScrapeJobResponse:
         response = await self._client.transport.get(
-            self._client._build_url(f"/scrape/batch/{job_id}"),
+            self._client._build_url(f"/scrape/batch/{job_id}"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
         )
         return BatchScrapeJobResponse(**response.data)
 
@@ -43,19 +53,14 @@ class BatchScrapeManager:
         if not job_id:
             raise HyperbrowserError("Failed to start batch scrape job")
 
-
+        job_status: ScrapeJobStatus = "pending"
         failures = 0
         while True:
             try:
-
-
-
-                if (
-                    job_response.status == "completed"
-                    or job_response.status == "failed"
-                ):
+                job_status_resp = await self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
                     break
-                failures = 0
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
@@ -68,8 +73,7 @@ class BatchScrapeManager:
         if not return_all_pages:
             while True:
                 try:
-
-                    return job_response
+                    return await self.get(job_id)
                 except Exception as e:
                     failures += 1
                     if failures >= POLLING_ATTEMPTS:
@@ -79,9 +83,21 @@ class BatchScrapeManager:
                 await asyncio.sleep(0.5)
 
         failures = 0
-        job_response
-
-
+        job_response = BatchScrapeJobResponse(
+            jobId=job_id,
+            status=job_status,
+            data=[],
+            currentPageBatch=0,
+            totalPageBatches=0,
+            totalScrapedPages=0,
+            batchSize=100,
+        )
+        first_check = True
+
+        while (
+            first_check
+            or job_response.current_page_batch < job_response.total_page_batches
+        ):
             try:
                 tmp_job_response = await self.get(
                     job_id,
@@ -96,6 +112,7 @@ class BatchScrapeManager:
                 job_response.total_page_batches = tmp_job_response.total_page_batches
                 job_response.batch_size = tmp_job_response.batch_size
                 failures = 0
+                first_check = False
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
@@ -119,6 +136,12 @@ class ScrapeManager:
         )
         return StartScrapeJobResponse(**response.data)
 
+    async def get_status(self, job_id: str) -> ScrapeJobStatusResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/{job_id}/status")
+        )
+        return ScrapeJobStatusResponse(**response.data)
+
     async def get(self, job_id: str) -> ScrapeJobResponse:
         response = await self._client.transport.get(
             self._client._build_url(f"/scrape/{job_id}")
@@ -134,12 +157,10 @@ class ScrapeManager:
         failures = 0
         while True:
             try:
-
-
-
-
-                ):
-                    return job_response
+                job_status_resp = await self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
+                    return await self.get(job_id)
                 failures = 0
             except Exception as e:
                 failures += 1
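The `first_check` flag added to both accumulation loops matters because the placeholder response starts with `current_page_batch == total_page_batches == 0`, which would make the loop condition false before the first request; the flag forces at least one round trip so the real batch totals can be learned. The accumulation logic, reduced to its core (hypothetical helper; `manager` stands in for a `BatchScrapeManager`):

```python
from hyperbrowser.models.scrape import GetBatchScrapeJobParams


async def collect_pages(manager, job_id: str):
    pages, current, total = [], 0, 0
    first_check = True
    while first_check or current < total:
        # Fetch the next 100-page batch; totals come back with each response.
        batch = await manager.get(
            job_id,
            params=GetBatchScrapeJobParams(page=current + 1, batch_size=100),
        )
        pages.extend(batch.data or [])
        current, total = batch.current_page_batch, batch.total_page_batches
        first_check = False
    return pages
```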
hyperbrowser/client/managers/async_manager/session.py

@@ -42,7 +42,8 @@ class SessionManager:
         self, params: SessionListParams = SessionListParams()
     ) -> SessionListResponse:
         response = await self._client.transport.get(
-            self._client._build_url("/sessions"),
+            self._client._build_url("/sessions"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
         )
         return SessionListResponse(**response.data)
 
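This `list()` fix (and the matching one in profile.py) actually forwards the filter object to the API; in 0.32.0 the params argument was accepted but never sent. `model_dump(exclude_none=True, by_alias=True)` is what turns the model into query parameters: aliased (camelCase) keys, unset fields dropped. Illustrated with a stand-in model (field names hypothetical):

```python
from typing import Optional

from pydantic import BaseModel, Field


class DemoListParams(BaseModel):  # stand-in for SessionListParams
    status: Optional[str] = None
    batch_size: Optional[int] = Field(default=None, alias="batchSize")


print(DemoListParams(batchSize=25).model_dump(exclude_none=True, by_alias=True))
# -> {'batchSize': 25}; status is omitted because it is None
```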
hyperbrowser/client/managers/sync_manager/{beta/agents → agents}/browser_use.py

@@ -1,12 +1,14 @@
 import time
+
 from hyperbrowser.exceptions import HyperbrowserError
-
+
+from .....models import (
     POLLING_ATTEMPTS,
     BasicResponse,
+    BrowserUseTaskResponse,
+    BrowserUseTaskStatusResponse,
     StartBrowserUseTaskParams,
     StartBrowserUseTaskResponse,
-    BrowserUseTaskStatusResponse,
-    BrowserUseTaskResponse,
 )
 
 
hyperbrowser/client/managers/sync_manager/crawl.py

@@ -4,6 +4,8 @@ from typing import Optional
 from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.crawl import (
     CrawlJobResponse,
+    CrawlJobStatus,
+    CrawlJobStatusResponse,
     GetCrawlJobParams,
     StartCrawlJobParams,
     StartCrawlJobResponse,
@@ -22,11 +24,18 @@ class CrawlManager:
         )
         return StartCrawlJobResponse(**response.data)
 
+    def get_status(self, job_id: str) -> CrawlJobStatusResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/crawl/{job_id}/status")
+        )
+        return CrawlJobStatusResponse(**response.data)
+
     def get(
         self, job_id: str, params: GetCrawlJobParams = GetCrawlJobParams()
     ) -> CrawlJobResponse:
         response = self._client.transport.get(
-            self._client._build_url(f"/crawl/{job_id}"),
+            self._client._build_url(f"/crawl/{job_id}"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
         )
         return CrawlJobResponse(**response.data)
 
@@ -38,18 +47,13 @@ class CrawlManager:
         if not job_id:
             raise HyperbrowserError("Failed to start crawl job")
 
-
+        job_status: CrawlJobStatus = "pending"
         failures = 0
         while True:
             try:
-
-
-
-                )
-                if (
-                    job_response.status == "completed"
-                    or job_response.status == "failed"
-                ):
+                job_status_resp = self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
                     break
             except Exception as e:
                 failures += 1
@@ -63,8 +67,7 @@ class CrawlManager:
         if not return_all_pages:
             while True:
                 try:
-
-                    return job_response
+                    return self.get(job_id)
                 except Exception as e:
                     failures += 1
                     if failures >= POLLING_ATTEMPTS:
@@ -74,12 +77,23 @@ class CrawlManager:
                 time.sleep(0.5)
 
         failures = 0
-        job_response
-
-
+        job_response = CrawlJobResponse(
+            jobId=job_id,
+            status=job_status,
+            data=[],
+            currentPageBatch=0,
+            totalPageBatches=0,
+            totalCrawledPages=0,
+            batchSize=100,
+        )
+        first_check = True
+        while (
+            first_check
+            or job_response.current_page_batch < job_response.total_page_batches
+        ):
             try:
                 tmp_job_response = self.get(
-                    job_id,
+                    job_start_resp.job_id,
                     GetCrawlJobParams(
                         page=job_response.current_page_batch + 1, batch_size=100
                     ),
@@ -91,6 +105,7 @@ class CrawlManager:
                 job_response.total_page_batches = tmp_job_response.total_page_batches
                 job_response.batch_size = tmp_job_response.batch_size
                 failures = 0
+                first_check = False
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
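The sync managers mirror the async changes one for one: `def` instead of `async def`, `time.sleep` instead of `asyncio.sleep`, the same status-first polling. End to end, the synchronous flow looks roughly like this (a sketch; `Hyperbrowser`, `start_and_wait`, and the param fields follow the published SDK surface rather than anything shown in this diff):

```python
from hyperbrowser import Hyperbrowser
from hyperbrowser.models import StartCrawlJobParams

client = Hyperbrowser(api_key="hb_...")  # placeholder key
# Blocks until the crawl completes or fails, polling the status endpoint.
job = client.crawl.start_and_wait(
    StartCrawlJobParams(url="https://example.com", max_pages=5)
)
print(job.status, job.total_crawled_pages)
```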
hyperbrowser/client/managers/sync_manager/extract.py

@@ -3,9 +3,11 @@ from hyperbrowser.exceptions import HyperbrowserError
 from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
+    ExtractJobStatusResponse,
     StartExtractJobParams,
     StartExtractJobResponse,
 )
+import jsonref
 
 
 class ExtractManager:
@@ -17,7 +19,9 @@ class ExtractManager:
             raise HyperbrowserError("Either schema or prompt must be provided")
         if params.schema_:
             if hasattr(params.schema_, "model_json_schema"):
-                params.schema_ =
+                params.schema_ = jsonref.replace_refs(
+                    params.schema_.model_json_schema(), proxies=False, lazy_load=False
+                )
 
         response = self._client.transport.post(
             self._client._build_url("/extract"),
@@ -25,6 +29,12 @@ class ExtractManager:
         )
         return StartExtractJobResponse(**response.data)
 
+    def get_status(self, job_id: str) -> ExtractJobStatusResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/extract/{job_id}/status")
+        )
+        return ExtractJobStatusResponse(**response.data)
+
     def get(self, job_id: str) -> ExtractJobResponse:
         response = self._client.transport.get(
             self._client._build_url(f"/extract/{job_id}")
@@ -40,12 +50,10 @@ class ExtractManager:
         failures = 0
         while True:
             try:
-
-
-
-
-                ):
-                    return job_response
+                job_status_resp = self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
+                    return self.get(job_id)
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
hyperbrowser/client/managers/sync_manager/profile.py

@@ -33,6 +33,7 @@ class ProfileManager:
         self, params: ProfileListParams = ProfileListParams()
     ) -> ProfileListResponse:
         response = self._client.transport.get(
-            self._client._build_url("/profiles"),
+            self._client._build_url("/profiles"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
        )
         return ProfileListResponse(**response.data)
hyperbrowser/client/managers/sync_manager/scrape.py

@@ -4,8 +4,11 @@ from typing import Optional
 from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.scrape import (
     BatchScrapeJobResponse,
+    BatchScrapeJobStatusResponse,
     GetBatchScrapeJobParams,
     ScrapeJobResponse,
+    ScrapeJobStatus,
+    ScrapeJobStatusResponse,
     StartBatchScrapeJobParams,
     StartBatchScrapeJobResponse,
     StartScrapeJobParams,
@@ -25,11 +28,18 @@ class BatchScrapeManager:
         )
         return StartBatchScrapeJobResponse(**response.data)
 
+    def get_status(self, job_id: str) -> BatchScrapeJobStatusResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}/status")
+        )
+        return BatchScrapeJobStatusResponse(**response.data)
+
     def get(
         self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
     ) -> BatchScrapeJobResponse:
         response = self._client.transport.get(
-            self._client._build_url(f"/scrape/batch/{job_id}"),
+            self._client._build_url(f"/scrape/batch/{job_id}"),
+            params=params.model_dump(exclude_none=True, by_alias=True),
         )
         return BatchScrapeJobResponse(**response.data)
 
@@ -41,19 +51,14 @@ class BatchScrapeManager:
         if not job_id:
             raise HyperbrowserError("Failed to start batch scrape job")
 
-
+        job_status: ScrapeJobStatus = "pending"
         failures = 0
         while True:
             try:
-
-
-
-                if (
-                    job_response.status == "completed"
-                    or job_response.status == "failed"
-                ):
+                job_status_resp = self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
                     break
-                failures = 0
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
@@ -66,8 +71,7 @@ class BatchScrapeManager:
         if not return_all_pages:
             while True:
                 try:
-
-                    return job_response
+                    return self.get(job_id)
                 except Exception as e:
                     failures += 1
                     if failures >= POLLING_ATTEMPTS:
@@ -77,13 +81,25 @@ class BatchScrapeManager:
                 time.sleep(0.5)
 
         failures = 0
-        job_response
-
-
+        job_response = BatchScrapeJobResponse(
+            jobId=job_id,
+            status=job_status,
+            data=[],
+            currentPageBatch=0,
+            totalPageBatches=0,
+            totalScrapedPages=0,
+            batchSize=100,
+        )
+        first_check = True
+
+        while (
+            first_check
+            or job_response.current_page_batch < job_response.total_page_batches
+        ):
             try:
                 tmp_job_response = self.get(
-
-                    GetBatchScrapeJobParams(
+                    job_id,
+                    params=GetBatchScrapeJobParams(
                         page=job_response.current_page_batch + 1, batch_size=100
                     ),
                 )
@@ -94,6 +110,7 @@ class BatchScrapeManager:
                 job_response.total_page_batches = tmp_job_response.total_page_batches
                 job_response.batch_size = tmp_job_response.batch_size
                 failures = 0
+                first_check = False
             except Exception as e:
                 failures += 1
                 if failures >= POLLING_ATTEMPTS:
@@ -117,6 +134,12 @@ class ScrapeManager:
         )
         return StartScrapeJobResponse(**response.data)
 
+    def get_status(self, job_id: str) -> ScrapeJobStatusResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/{job_id}/status")
+        )
+        return ScrapeJobStatusResponse(**response.data)
+
     def get(self, job_id: str) -> ScrapeJobResponse:
         response = self._client.transport.get(
             self._client._build_url(f"/scrape/{job_id}")
@@ -132,12 +155,10 @@ class ScrapeManager:
         failures = 0
         while True:
             try:
-
-
-
-
-                ):
-                    return job_response
+                job_status_resp = self.get_status(job_id)
+                job_status = job_status_resp.status
+                if job_status == "completed" or job_status == "failed":
+                    return self.get(job_id)
                 failures = 0
             except Exception as e:
                 failures += 1