hyperbrowser 0.22.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hyperbrowser has been flagged as possibly problematic by the registry scanner.
- hyperbrowser/client/managers/async_manager/crawl.py +57 -16
- hyperbrowser/client/managers/async_manager/extract.py +19 -4
- hyperbrowser/client/managers/async_manager/scrape.py +118 -4
- hyperbrowser/client/managers/sync_manager/crawl.py +57 -15
- hyperbrowser/client/managers/sync_manager/extract.py +18 -4
- hyperbrowser/client/managers/sync_manager/scrape.py +116 -4
- hyperbrowser/models/consts.py +3 -0
- hyperbrowser/models/crawl.py +2 -2
- hyperbrowser/models/extract.py +5 -1
- hyperbrowser/models/scrape.py +79 -1
- {hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/METADATA +1 -1
- {hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/RECORD +14 -14
- {hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/LICENSE +0 -0
- {hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/WHEEL +0 -0
hyperbrowser/client/managers/async_manager/crawl.py
CHANGED

@@ -1,5 +1,6 @@
 import asyncio
-
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.crawl import (
     CrawlJobResponse,
     GetCrawlJobParams,

@@ -32,29 +33,69 @@ class CrawlManager:
         self, params: StartCrawlJobParams, return_all_pages: bool = True
     ) -> CrawlJobResponse:
         job_start_resp = await self.start(params)
-
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start crawl job")

         job_response: CrawlJobResponse
+        failures = 0
         while True:
-
-
-
+            try:
+                job_response = await self.get(
+                    job_id,
+                    params=GetCrawlJobParams(batch_size=1),
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             await asyncio.sleep(2)

+        failures = 0
         if not return_all_pages:
-
+            while True:
+                try:
+                    job_response = await self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                await asyncio.sleep(0.5)

+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
         while job_response.current_page_batch < job_response.total_page_batches:
-
-
-
-
-
-
-
-
-
-
+            try:
+                tmp_job_response = await self.get(
+                    job_start_resp.job_id,
+                    GetCrawlJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get crawl batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             await asyncio.sleep(0.5)
+
         return job_response
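The crawl managers now poll self.get with a small retry budget: POLLING_ATTEMPTS (5, defined in consts.py below) consecutive failures abort the wait, and completed jobs are re-fetched in page batches of 100. A minimal caller-side sketch of the async flow, assuming the crawl manager is exposed as client.crawl on an AsyncHyperbrowser client (the client wiring is not part of this diff):

import asyncio

from hyperbrowser import AsyncHyperbrowser  # assumed export
from hyperbrowser.models.crawl import StartCrawlJobParams


async def main() -> None:
    client = AsyncHyperbrowser(api_key="hb_...")  # placeholder key
    # Polls every 2s, tolerating up to POLLING_ATTEMPTS consecutive errors,
    # then pages through the crawled results 100 at a time.
    job = await client.crawl.start_and_wait(
        StartCrawlJobParams(url="https://example.com", max_pages=5),
        return_all_pages=True,
    )
    print(job.status, len(job.data or []))


asyncio.run(main())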
hyperbrowser/client/managers/async_manager/extract.py
CHANGED

@@ -1,5 +1,6 @@
 import asyncio
 from hyperbrowser.exceptions import HyperbrowserError
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
     StartExtractJobParams,

@@ -32,10 +33,24 @@ class ExtractManager:

     async def start_and_wait(self, params: StartExtractJobParams) -> ExtractJobResponse:
         job_start_resp = await self.start(params)
-
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start extract job")
+
+        failures = 0
         while True:
-
-
-
+            try:
+                job_response = await self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll extract job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             await asyncio.sleep(2)
hyperbrowser/client/managers/async_manager/scrape.py
CHANGED

@@ -1,16 +1,116 @@
 import asyncio
 from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.scrape import (
+    BatchScrapeJobResponse,
+    GetBatchScrapeJobParams,
     ScrapeJobResponse,
+    StartBatchScrapeJobParams,
+    StartBatchScrapeJobResponse,
     StartScrapeJobParams,
     StartScrapeJobResponse,
 )
 from ....exceptions import HyperbrowserError


+class BatchScrapeManager:
+    def __init__(self, client):
+        self._client = client
+
+    async def start(
+        self, params: StartBatchScrapeJobParams
+    ) -> StartBatchScrapeJobResponse:
+        response = await self._client.transport.post(
+            self._client._build_url("/scrape/batch"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartBatchScrapeJobResponse(**response.data)
+
+    async def get(
+        self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
+    ) -> BatchScrapeJobResponse:
+        response = await self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}"), params=params.__dict__
+        )
+        return BatchScrapeJobResponse(**response.data)
+
+    async def start_and_wait(
+        self, params: StartBatchScrapeJobParams, return_all_pages: bool = True
+    ) -> BatchScrapeJobResponse:
+        job_start_resp = await self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start batch scrape job")
+
+        job_response: BatchScrapeJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = await self.get(
+                    job_id, params=GetBatchScrapeJobParams(batch_size=1)
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = await self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                await asyncio.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = await self.get(
+                    job_id,
+                    params=GetBatchScrapeJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_scraped_pages = tmp_job_response.total_scraped_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            await asyncio.sleep(0.5)
+
+        return job_response
+
+
 class ScrapeManager:
     def __init__(self, client):
         self._client = client
+        self.batch = BatchScrapeManager(client)

     async def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
         response = await self._client.transport.post(

@@ -27,10 +127,24 @@ class ScrapeManager:

     async def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
         job_start_resp = await self.start(params)
-
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start scrape job")
+
+        failures = 0
         while True:
-
-
-
+            try:
+                job_response = await self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             await asyncio.sleep(2)
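The new BatchScrapeManager is attached to ScrapeManager as self.batch, so batch scraping hangs off the existing scrape namespace. A hedged usage sketch, assuming the manager is reachable at client.scrape.batch on an AsyncHyperbrowser client (only the manager itself is shown in this diff):

import asyncio

from hyperbrowser import AsyncHyperbrowser  # assumed export
from hyperbrowser.models.scrape import StartBatchScrapeJobParams


async def main() -> None:
    client = AsyncHyperbrowser(api_key="hb_...")  # placeholder key
    # start_and_wait polls /scrape/batch/{job_id} until completed/failed,
    # then collects every result batch (100 pages per request).
    job = await client.scrape.batch.start_and_wait(
        StartBatchScrapeJobParams(urls=["https://example.com", "https://example.org"])
    )
    for page in job.data or []:
        print(page.url, page.status)


asyncio.run(main())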
hyperbrowser/client/managers/sync_manager/crawl.py
CHANGED

@@ -1,5 +1,7 @@
 import time
 from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.crawl import (
     CrawlJobResponse,
     GetCrawlJobParams,

@@ -32,29 +34,69 @@ class CrawlManager:
         self, params: StartCrawlJobParams, return_all_pages: bool = True
     ) -> CrawlJobResponse:
         job_start_resp = self.start(params)
-
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start crawl job")

         job_response: CrawlJobResponse
+        failures = 0
         while True:
-
-
-
+            try:
+                job_response = self.get(
+                    job_id,
+                    params=GetCrawlJobParams(batch_size=1),
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             time.sleep(2)

+        failures = 0
         if not return_all_pages:
-
+            while True:
+                try:
+                    job_response = self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get crawl job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                time.sleep(0.5)

+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
         while job_response.current_page_batch < job_response.total_page_batches:
-
-
-
-
-
-
-
-
-
-
+            try:
+                tmp_job_response = self.get(
+                    job_id,
+                    GetCrawlJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_crawled_pages = tmp_job_response.total_crawled_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get crawl batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             time.sleep(0.5)
+
         return job_response
hyperbrowser/client/managers/sync_manager/extract.py
CHANGED

@@ -1,5 +1,6 @@
 import time
 from hyperbrowser.exceptions import HyperbrowserError
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from hyperbrowser.models.extract import (
     ExtractJobResponse,
     StartExtractJobParams,

@@ -32,10 +33,23 @@ class ExtractManager:

     def start_and_wait(self, params: StartExtractJobParams) -> ExtractJobResponse:
         job_start_resp = self.start(params)
-
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start extract job")
+
+        failures = 0
         while True:
-
-
-
+            try:
+                job_response = self.get(job_start_resp.job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll extract job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             time.sleep(2)
hyperbrowser/client/managers/sync_manager/scrape.py
CHANGED

@@ -1,16 +1,114 @@
 import time
 from typing import Optional
+
+from hyperbrowser.models.consts import POLLING_ATTEMPTS
 from ....models.scrape import (
+    BatchScrapeJobResponse,
+    GetBatchScrapeJobParams,
     ScrapeJobResponse,
+    StartBatchScrapeJobParams,
+    StartBatchScrapeJobResponse,
     StartScrapeJobParams,
     StartScrapeJobResponse,
 )
 from ....exceptions import HyperbrowserError


+class BatchScrapeManager:
+    def __init__(self, client):
+        self._client = client
+
+    def start(self, params: StartBatchScrapeJobParams) -> StartBatchScrapeJobResponse:
+        response = self._client.transport.post(
+            self._client._build_url("/scrape/batch"),
+            data=params.model_dump(exclude_none=True, by_alias=True),
+        )
+        return StartBatchScrapeJobResponse(**response.data)
+
+    def get(
+        self, job_id: str, params: GetBatchScrapeJobParams = GetBatchScrapeJobParams()
+    ) -> BatchScrapeJobResponse:
+        response = self._client.transport.get(
+            self._client._build_url(f"/scrape/batch/{job_id}"), params=params.__dict__
+        )
+        return BatchScrapeJobResponse(**response.data)
+
+    def start_and_wait(
+        self, params: StartBatchScrapeJobParams, return_all_pages: bool = True
+    ) -> BatchScrapeJobResponse:
+        job_start_resp = self.start(params)
+        job_id = job_start_resp.job_id
+        if not job_id:
+            raise HyperbrowserError("Failed to start batch scrape job")
+
+        job_response: BatchScrapeJobResponse
+        failures = 0
+        while True:
+            try:
+                job_response = self.get(
+                    job_id, params=GetBatchScrapeJobParams(batch_size=1)
+                )
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    break
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(2)
+
+        failures = 0
+        if not return_all_pages:
+            while True:
+                try:
+                    job_response = self.get(job_id)
+                    return job_response
+                except Exception as e:
+                    failures += 1
+                    if failures >= POLLING_ATTEMPTS:
+                        raise HyperbrowserError(
+                            f"Failed to get batch scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                        )
+                time.sleep(0.5)
+
+        failures = 0
+        job_response.current_page_batch = 0
+        job_response.data = []
+        while job_response.current_page_batch < job_response.total_page_batches:
+            try:
+                tmp_job_response = self.get(
+                    job_start_resp.job_id,
+                    GetBatchScrapeJobParams(
+                        page=job_response.current_page_batch + 1, batch_size=100
+                    ),
+                )
+                if tmp_job_response.data:
+                    job_response.data.extend(tmp_job_response.data)
+                job_response.current_page_batch = tmp_job_response.current_page_batch
+                job_response.total_scraped_pages = tmp_job_response.total_scraped_pages
+                job_response.total_page_batches = tmp_job_response.total_page_batches
+                job_response.batch_size = tmp_job_response.batch_size
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to get batch page {job_response.current_page_batch} for job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
+            time.sleep(0.5)
+
+        return job_response
+
+
 class ScrapeManager:
     def __init__(self, client):
         self._client = client
+        self.batch = BatchScrapeManager(client)

     def start(self, params: StartScrapeJobParams) -> StartScrapeJobResponse:
         response = self._client.transport.post(

@@ -27,10 +125,24 @@ class ScrapeManager:

     def start_and_wait(self, params: StartScrapeJobParams) -> ScrapeJobResponse:
         job_start_resp = self.start(params)
-
+        job_id = job_start_resp.job_id
+        if not job_id:
             raise HyperbrowserError("Failed to start scrape job")
+
+        failures = 0
         while True:
-
-
-
+            try:
+                job_response = self.get(job_id)
+                if (
+                    job_response.status == "completed"
+                    or job_response.status == "failed"
+                ):
+                    return job_response
+                failures = 0
+            except Exception as e:
+                failures += 1
+                if failures >= POLLING_ATTEMPTS:
+                    raise HyperbrowserError(
+                        f"Failed to poll scrape job {job_id} after {POLLING_ATTEMPTS} attempts: {e}"
+                    )
             time.sleep(2)
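The sync manager mirrors the async one, with time.sleep in place of asyncio.sleep. A blocking equivalent of the example above, again assuming a Hyperbrowser sync client exposing client.scrape.batch:

from hyperbrowser import Hyperbrowser  # assumed export
from hyperbrowser.models.scrape import StartBatchScrapeJobParams

client = Hyperbrowser(api_key="hb_...")  # placeholder key
job = client.scrape.batch.start_and_wait(
    StartBatchScrapeJobParams(urls=["https://example.com"]),
    return_all_pages=True,
)
print(job.total_scraped_pages, job.total_page_batches)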
hyperbrowser/models/consts.py
CHANGED

@@ -1,6 +1,9 @@
 from typing import Literal

 ScrapeFormat = Literal["markdown", "html", "links", "screenshot"]
+ScrapeWaitUntil = Literal["load", "domcontentloaded", "networkidle"]
+ScrapePageStatus = Literal["completed", "failed", "pending", "running"]
+POLLING_ATTEMPTS = 5

 Country = Literal[
     "AD",
hyperbrowser/models/crawl.py
CHANGED

@@ -18,7 +18,7 @@ class StartCrawlJobParams(BaseModel):
     )

     url: str
-    max_pages: int = Field(default=
+    max_pages: Optional[int] = Field(default=None, ge=1, serialization_alias="maxPages")
     follow_links: bool = Field(default=True, serialization_alias="followLinks")
     ignore_sitemap: bool = Field(default=False, serialization_alias="ignoreSitemap")
     exclude_patterns: List[str] = Field(

@@ -69,7 +69,7 @@ class GetCrawlJobParams(BaseModel):

     page: Optional[int] = Field(default=None, serialization_alias="page")
     batch_size: Optional[int] = Field(
-        default=
+        default=None, ge=1, serialization_alias="batchSize"
     )

hyperbrowser/models/extract.py
CHANGED

@@ -17,13 +17,17 @@ class StartExtractJobParams(BaseModel):
     )

     urls: List[str]
-
+    system_prompt: Optional[str] = Field(
+        default=None, serialization_alias="systemPrompt"
+    )
+    prompt: Optional[str] = Field(default=None, serialization_alias="prompt")
     schema_: Optional[Any] = pydantic.Field(
         None, alias="schema", serialization_alias="schema"
     )
     session_options: Optional[CreateSessionParams] = Field(
         default=None, serialization_alias="sessionOptions"
     )
+    max_links: Optional[int] = Field(default=None, serialization_alias="maxLinks")


 class StartExtractJobResponse(BaseModel):
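StartExtractJobParams gains optional system_prompt, prompt, and max_links fields, all serialized to camelCase. A small sketch of how the new fields serialize (the field values here are illustrative only):

from hyperbrowser.models.extract import StartExtractJobParams

params = StartExtractJobParams(
    urls=["https://example.com/pricing"],
    system_prompt="You extract structured pricing data.",
    prompt="Return the cheapest plan name and its monthly price.",
    max_links=10,
)
# by_alias=True applies the serialization aliases (systemPrompt, maxLinks, ...)
print(params.model_dump(exclude_none=True, by_alias=True))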
hyperbrowser/models/scrape.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import List, Literal, Optional, Union
 from pydantic import BaseModel, ConfigDict, Field

-from hyperbrowser.models.consts import ScrapeFormat
+from hyperbrowser.models.consts import ScrapeFormat, ScrapePageStatus, ScrapeWaitUntil
 from hyperbrowser.models.session import CreateSessionParams

 ScrapeJobStatus = Literal["pending", "running", "completed", "failed"]

@@ -24,6 +24,9 @@ class ScrapeOptions(BaseModel):
     )
     wait_for: Optional[int] = Field(default=None, serialization_alias="waitFor")
     timeout: Optional[int] = Field(default=None, serialization_alias="timeout")
+    wait_until: Optional[ScrapeWaitUntil] = Field(
+        default=None, serialization_alias="waitUntil"
+    )


 class StartScrapeJobParams(BaseModel):

@@ -81,3 +84,78 @@ class ScrapeJobResponse(BaseModel):
     status: ScrapeJobStatus
     error: Optional[str] = None
     data: Optional[ScrapeJobData] = None
+
+
+class StartBatchScrapeJobParams(BaseModel):
+    """
+    Parameters for creating a new batch scrape job.
+    """
+
+    urls: List[str]
+    session_options: Optional[CreateSessionParams] = Field(
+        default=None, serialization_alias="sessionOptions"
+    )
+    scrape_options: Optional[ScrapeOptions] = Field(
+        default=None, serialization_alias="scrapeOptions"
+    )
+
+
+class ScrapedPage(BaseModel):
+    """
+    A scraped page.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    url: str
+    status: ScrapePageStatus
+    error: Optional[str] = None
+    metadata: Optional[dict[str, Union[str, list[str]]]] = None
+    html: Optional[str] = None
+    markdown: Optional[str] = None
+    links: Optional[List[str]] = None
+    screenshot: Optional[str] = None
+
+
+class GetBatchScrapeJobParams(BaseModel):
+    """
+    Parameters for getting a batch scrape job.
+    """
+
+    page: Optional[int] = Field(default=None, serialization_alias="page")
+    batch_size: Optional[int] = Field(
+        default=None, ge=1, serialization_alias="batchSize"
+    )
+
+
+class StartBatchScrapeJobResponse(BaseModel):
+    """
+    Response from starting a batch scrape job.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    job_id: str = Field(alias="jobId")
+
+
+class BatchScrapeJobResponse(BaseModel):
+    """
+    Response from getting a batch scrape job.
+    """
+
+    model_config = ConfigDict(
+        populate_by_alias=True,
+    )
+
+    job_id: str = Field(alias="jobId")
+    status: ScrapeJobStatus
+    error: Optional[str] = None
+    data: Optional[List[ScrapedPage]] = Field(alias="data")
+    total_scraped_pages: int = Field(alias="totalScrapedPages")
+    total_page_batches: int = Field(alias="totalPageBatches")
+    current_page_batch: int = Field(alias="currentPageBatch")
+    batch_size: int = Field(alias="batchSize")
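ScrapeOptions picks up an optional wait_until drawn from the new ScrapeWaitUntil literal, and batch scraping gets its own param/response models. A sketch of constructing the new models, assuming the remaining ScrapeOptions fields keep their defaults (only wait_until is shown in this diff):

from hyperbrowser.models.scrape import (
    GetBatchScrapeJobParams,
    ScrapeOptions,
    StartBatchScrapeJobParams,
)

params = StartBatchScrapeJobParams(
    urls=["https://example.com", "https://example.org"],
    scrape_options=ScrapeOptions(wait_until="networkidle", timeout=15000),
)
print(params.model_dump(exclude_none=True, by_alias=True))

# Results are paged; batch_size is validated to be >= 1.
page_one = GetBatchScrapeJobParams(page=1, batch_size=100)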
{hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/RECORD
CHANGED

@@ -1,27 +1,27 @@
 hyperbrowser/__init__.py,sha256=zWGcLhqhvWy6BTwuNpzWK1-0LpIn311ks-4U9nrsb7Y,187
 hyperbrowser/client/async_client.py,sha256=L7mbzg_wOVMneOm6-bA5XaBoVWUmybuRogE4YEMR5Bg,1389
 hyperbrowser/client/base.py,sha256=9gFma7RdvJBUlDCqr8tZd315UPrjn4ldU4B0-Y-L4O4,1268
-hyperbrowser/client/managers/async_manager/crawl.py,sha256=
+hyperbrowser/client/managers/async_manager/crawl.py,sha256=n0KhHarVpPFwjjlc9UnhSfD5vvdC2kpcXHVr8vPiKxE,3870
 hyperbrowser/client/managers/async_manager/extension.py,sha256=a-xYtXXdCspukYtsguRgjEoQ8E_kzzA2tQAJtIyCtAs,1439
-hyperbrowser/client/managers/async_manager/extract.py,sha256=
+hyperbrowser/client/managers/async_manager/extract.py,sha256=9p8dGLYmoow7smnQ0BTRN6diDmIayjW-EPRGzzvheZk,2102
 hyperbrowser/client/managers/async_manager/profile.py,sha256=f2uX2GGYdgL0fyzB0jnI-L-nWleqG6cwZ0pc1K1zdQY,1244
-hyperbrowser/client/managers/async_manager/scrape.py,sha256=
+hyperbrowser/client/managers/async_manager/scrape.py,sha256=Lr6oicTX3U1xooczUVQlsq2mzIhAKEW-A9909ZdSzKc,5720
 hyperbrowser/client/managers/async_manager/session.py,sha256=ObJhz1IkCCIQLwmztQ-M7lCKzKsVDr-eWCFnan2d9rQ,1692
-hyperbrowser/client/managers/sync_manager/crawl.py,sha256=
+hyperbrowser/client/managers/sync_manager/crawl.py,sha256=uAVmjhUbamVnzAAyfswq1bdBR5c7JrfGVvPdVmmw4R8,3799
 hyperbrowser/client/managers/sync_manager/extension.py,sha256=1YoyTZtMo43trl9jAsXv95aor0nBHiJEmLva39jFW-k,1415
-hyperbrowser/client/managers/sync_manager/extract.py,sha256=
+hyperbrowser/client/managers/sync_manager/extract.py,sha256=XocMKC0qAarRpE12KU4m_mi1KhUOHp3-TK4dLeiIn6E,2034
 hyperbrowser/client/managers/sync_manager/profile.py,sha256=va6mlhQ5SKZa-viEGFNzV6UBZEP5SqwVp32_oxC_NzM,1196
-hyperbrowser/client/managers/sync_manager/scrape.py,sha256=
+hyperbrowser/client/managers/sync_manager/scrape.py,sha256=y4YB-NusXRi3brE7jBGRBHGANY-_-aHMBirKuuU6mdg,5579
 hyperbrowser/client/managers/sync_manager/session.py,sha256=74cekrDaGKW5WlP_0Qrqlk-xW2p1u4s63E-D08a4A2s,1610
 hyperbrowser/client/sync.py,sha256=HgglJY9pNdW987OzNO_5dSZgj1AfAqovCmY99WYQD2E,1213
 hyperbrowser/config.py,sha256=2J6GYNR_83vzJZ6jEV-LXO1U-q6DHIrfyAU0WrUPhw8,625
 hyperbrowser/exceptions.py,sha256=SUUkptK2OL36xDORYmSicaTYR7pMbxeWAjAgz35xnM8,1171
-hyperbrowser/models/consts.py,sha256=
-hyperbrowser/models/crawl.py,sha256=
+hyperbrowser/models/consts.py,sha256=L_6A8JhqLVuR-7p1gMGU6X-eF6KKRM0QIofc_J77Vgw,5146
+hyperbrowser/models/crawl.py,sha256=22hP_DPZMfa2MAfOeJ90qj5CH4rr7VtQT1gCQqO8jO8,2610
 hyperbrowser/models/extension.py,sha256=nXjKXKt9R7RxyZ4hd3EvfqZsEGy_ufh1r5j2mqCLykQ,804
-hyperbrowser/models/extract.py,sha256=
+hyperbrowser/models/extract.py,sha256=24rNo0BzczRPTb3JOcz_WHo5Lz3rBc1Z3-l4EhhroI0,1447
 hyperbrowser/models/profile.py,sha256=KRb_LNxxW00AsD_thzzthFS51vInJawt1RcoNz4Q9i8,1322
-hyperbrowser/models/scrape.py,sha256=
+hyperbrowser/models/scrape.py,sha256=HAQJk8KOTcTb9NuD_106tlWlUj55SOhZ6j6vzoQbsZ4,4159
 hyperbrowser/models/session.py,sha256=pBSXnwhiibx8iW7fuxm25p5YFrq1-i_wCQA7mWgFgM0,5294
 hyperbrowser/tools/__init__.py,sha256=OUaTUM-kiigYmzfbpx3XQhzMK1xT1wd8cqXgR4znsAY,2021
 hyperbrowser/tools/anthropic.py,sha256=5pEkJm1H-26GToTwXsDjo4GGqVy1hATws4Pg59mumow,1667

@@ -30,7 +30,7 @@ hyperbrowser/tools/schema.py,sha256=cR2MUX8TvUyN8TnCyeX0pccp4AmPjrdaKzuAXRThOJo,
 hyperbrowser/transport/async_transport.py,sha256=MIPJvilvZWBPXLZ96c9OohuN6TN9DaaU0EnyleG3q6g,4017
 hyperbrowser/transport/base.py,sha256=ildpMrDiM8nvrSGrH2LTOafmB17T7PQB_NQ1ODA378U,1703
 hyperbrowser/transport/sync.py,sha256=ER844H_OCPCrnmbc58cuqphWTVvCZJQn7-D7ZenCr3Y,3311
-hyperbrowser-0.
-hyperbrowser-0.
-hyperbrowser-0.
-hyperbrowser-0.
+hyperbrowser-0.24.0.dist-info/LICENSE,sha256=6rUGKlyKb_1ZAH7h7YITYAAUNFN3MNGGKCyfrw49NLE,1071
+hyperbrowser-0.24.0.dist-info/METADATA,sha256=gyDz49SMWKu3O3XpnP7zfS3n0R9UPOUl_dA4iNz22Pg,3440
+hyperbrowser-0.24.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+hyperbrowser-0.24.0.dist-info/RECORD,,

{hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/LICENSE
File without changes

{hyperbrowser-0.22.0.dist-info → hyperbrowser-0.24.0.dist-info}/WHEEL
File without changes