regscale-cli 6.18.0.0-py3-none-any.whl → 6.19.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of regscale-cli might be problematic.
- regscale/__init__.py +1 -1
- regscale/integrations/api_paginator.py +932 -0
- regscale/integrations/api_paginator_example.py +348 -0
- regscale/integrations/commercial/__init__.py +11 -10
- regscale/integrations/commercial/{qualys.py → qualys/__init__.py} +756 -105
- regscale/integrations/commercial/qualys/scanner.py +1051 -0
- regscale/integrations/commercial/qualys/variables.py +21 -0
- regscale/integrations/commercial/sicura/api.py +1 -0
- regscale/integrations/commercial/stigv2/click_commands.py +36 -8
- regscale/integrations/commercial/stigv2/stig_integration.py +63 -9
- regscale/integrations/commercial/tenablev2/__init__.py +9 -0
- regscale/integrations/commercial/tenablev2/authenticate.py +23 -2
- regscale/integrations/commercial/tenablev2/commands.py +779 -0
- regscale/integrations/commercial/tenablev2/jsonl_scanner.py +1999 -0
- regscale/integrations/commercial/tenablev2/sc_scanner.py +600 -0
- regscale/integrations/commercial/tenablev2/scanner.py +7 -5
- regscale/integrations/commercial/tenablev2/utils.py +21 -4
- regscale/integrations/commercial/tenablev2/variables.py +4 -0
- regscale/integrations/jsonl_scanner_integration.py +523 -142
- regscale/integrations/scanner_integration.py +102 -26
- regscale/integrations/transformer/__init__.py +17 -0
- regscale/integrations/transformer/data_transformer.py +445 -0
- regscale/integrations/transformer/mappings/__init__.py +8 -0
- regscale/integrations/variables.py +2 -0
- regscale/models/__init__.py +5 -2
- regscale/models/integration_models/cisa_kev_data.json +5 -5
- regscale/models/integration_models/synqly_models/capabilities.json +1 -1
- regscale/models/regscale_models/asset.py +5 -2
- regscale/models/regscale_models/file.py +5 -2
- regscale/regscale.py +3 -1
- {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/METADATA +1 -1
- {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/RECORD +44 -28
- tests/regscale/core/test_version.py +22 -0
- tests/regscale/integrations/__init__.py +0 -0
- tests/regscale/integrations/test_api_paginator.py +597 -0
- tests/regscale/integrations/test_integration_mapping.py +60 -0
- tests/regscale/integrations/test_issue_creation.py +317 -0
- tests/regscale/integrations/test_issue_due_date.py +46 -0
- tests/regscale/integrations/transformer/__init__.py +0 -0
- tests/regscale/integrations/transformer/test_data_transformer.py +850 -0
- regscale/integrations/commercial/tenablev2/click.py +0 -1641
- {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/LICENSE +0 -0
- {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/WHEEL +0 -0
- {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/entry_points.txt +0 -0
- {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/top_level.txt +0 -0
regscale/integrations/api_paginator.py
@@ -0,0 +1,932 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+API Paginator for RegScale integrations.
+
+This class provides a reusable way to fetch paginated API responses and optionally
+write results to JSONL files for processing by scanner integrations.
+"""
+
+import json
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from regscale.exceptions import ValidationException
+
+logger = logging.getLogger("regscale")
+
+# Constants for common patterns and protocols
+HTTPS_PREFIX = "https://"  # NOSONAR
+HTTP_PATTERN = "http://"  # NOSONAR
+ALLOWED_PAGINATION_TYPES = ["offset", "page", "token", "cursor", "custom"]
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+ALLOWED_HTTP_METHODS = ["GET", "POST"]
+DEFAULT_PAGE_SIZE = 100
+WRITE_MODE = "w"
+APPEND_MODE = "a"
+
+
+class ApiPaginator:
+    """
+    A utility class to handle API pagination and write results to a JSONL file.
+
+    This class is designed to work with RESTful APIs that use common pagination patterns.
+    It can retrieve all pages of results and optionally write them to a file for further processing.
+
+    Supports various pagination methods:
+    - Offset/limit pagination
+    - Page/per_page pagination
+    - Token-based pagination
+    - Cursor-based pagination
+
+    Also includes error handling, rate limiting, and concurrent requests.
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        auth_headers: Dict[str, str],
+        output_file: Optional[str] = None,
+        page_size: int = DEFAULT_PAGE_SIZE,
+        max_pages: Optional[int] = None,
+        timeout: int = 30,
+        retry_attempts: int = 3,
+        retry_backoff_factor: float = 0.5,
+        throttle_rate: Optional[float] = None,
+        concurrent_requests: int = 1,
+        ssl_verify: bool = True,
+    ):
+        """
+        Initialize the API Paginator.
+
+        Args:
+            base_url (str): The base URL for the API
+            auth_headers (Dict[str, str]): Authentication headers for the API
+            output_file (Optional[str]): Path to write results to (JSONL format)
+            page_size (int): Number of items per page to request
+            max_pages (Optional[int]): Maximum number of pages to retrieve (None for all)
+            timeout (int): Request timeout in seconds
+            retry_attempts (int): Number of times to retry failed requests
+            retry_backoff_factor (float): Backoff factor for retries
+            throttle_rate (Optional[float]): Seconds to wait between requests (rate limiting)
+            concurrent_requests (int): Number of concurrent requests to make
+            ssl_verify (bool): Whether to verify SSL certificates
+        """
+        self.base_url = base_url.rstrip("/")
+        self.auth_headers = auth_headers
+        self.output_file = output_file
+        self.page_size = page_size
+        self.max_pages = max_pages
+        self.timeout = timeout
+        self.retry_attempts = retry_attempts
+        self.retry_backoff_factor = retry_backoff_factor
+        self.throttle_rate = throttle_rate
+        self.concurrent_requests = max(1, concurrent_requests)
+        self.ssl_verify = ssl_verify
+
+        # Initialize session with retry capability
+        self.session = self._create_session()
+
+        # Ensure output directory exists if file is specified
+        self._ensure_output_dir_exists()
+
+    def _ensure_output_dir_exists(self) -> None:
+        """Ensure the output directory exists if output file is specified."""
+        if self.output_file:
+            output_dir = os.path.dirname(os.path.abspath(self.output_file))
+            os.makedirs(output_dir, exist_ok=True)
+
+    def _create_session(self) -> requests.Session:
+        """
+        Create a requests session with retry capability.
+
+        Returns:
+            requests.Session: Configured session object
+        """
+        session = requests.Session()
+
+        # Configure retry strategy
+        retry_strategy = Retry(
+            total=self.retry_attempts,
+            backoff_factor=self.retry_backoff_factor,
+            status_forcelist=RETRY_STATUS_CODES,
+            allowed_methods=ALLOWED_HTTP_METHODS,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount(HTTPS_PREFIX, adapter)
+
+        # Only mount HTTP adapter if SSL verification is disabled (for internal/development use)
+        if not self.ssl_verify:
+            session.mount(HTTP_PATTERN, adapter)
+            logger.warning(
+                "HTTP protocol enabled due to disabled SSL verification. Not recommended for production use."
+            )
+
+        # Add default headers
+        session.headers.update(self.auth_headers)
+
+        return session
+
+    def _prepare_pagination_params(
+        self, pagination_type: str, params: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Prepare pagination parameters based on pagination type.
+
+        Args:
+            pagination_type (str): Type of pagination
+            params (Optional[Dict[str, Any]]): Existing parameters
+
+        Returns:
+            Dict[str, Any]: Updated parameters
+        """
+        current_params = params.copy() if params else {}
+
+        if pagination_type == "offset":
+            current_params["limit"] = self.page_size
+            current_params["offset"] = 0
+        elif pagination_type == "page":
+            current_params["per_page"] = self.page_size
+            current_params["page"] = 1
+
+        return current_params
+
+    def _apply_throttling(self, page_count: int) -> None:
+        """Apply throttling between requests if configured.
+
+        Args:
+            page_count (int): Current page count
+        """
+        if self.throttle_rate and page_count > 0:
+            time.sleep(self.throttle_rate)
+
+    def _make_request(
+        self, url: str, params: Dict[str, Any], request_method: str, post_data: Optional[Dict[str, Any]] = None
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Make an HTTP request and handle errors.
+
+        Args:
+            url (str): URL to request
+            params (Dict[str, Any]): Query parameters
+            request_method (str): HTTP method (GET/POST)
+            post_data (Optional[Dict[str, Any]]): Data for POST requests
+
+        Returns:
+            Optional[Dict[str, Any]]: Response data or None on error
+        """
+        response = None
+        try:
+            if request_method.upper() == "GET":
+                response = self.session.get(url, params=params, timeout=self.timeout, verify=self.ssl_verify)
+            else:  # POST
+                response = self.session.post(
+                    url, params=params, json=post_data, timeout=self.timeout, verify=self.ssl_verify
+                )
+
+            response.raise_for_status()
+            return response.json()
+        except requests.RequestException as e:
+            logger.error(f"Request error: {str(e)}")
+            logger.debug(f"Response: {response.text if hasattr(response, 'text') else 'No response text'}")
+            return None
+        except ValueError as e:
+            logger.error(f"JSON parsing error: {str(e)}")
+            return None
+
+    def _extract_data(self, result: Dict[str, Any], data_path: Optional[str]) -> Optional[List[Dict[str, Any]]]:
+        """
+        Extract data from API response based on data path.
+
+        Args:
+            result (Dict[str, Any]): API response
+            data_path (Optional[str]): Path to data in response
+
+        Returns:
+            Optional[List[Dict[str, Any]]]: Extracted data items or None
+        """
+        if not result:
+            return None
+
+        if data_path:
+            # Navigate the nested structure to find data
+            data = self._navigate_data_path(result, data_path)
+            if not data:
+                return None
+        else:
+            # Use the entire response if no path is specified
+            data = result
+
+        # Convert to list if it's not already
+        return data if isinstance(data, list) else [data]
+
+    def _navigate_data_path(self, data: Dict[str, Any], path: str) -> Any:
+        """
+        Navigate a nested structure using a dot-separated path.
+
+        Args:
+            data (Dict[str, Any]): The data structure to navigate
+            path (str): Dot-separated path
+
+        Returns:
+            Any: The value found or empty dict if not found
+        """
+        result = data
+        for key in path.split("."):
+            result = result.get(key, {})
+            if not result and result != 0:  # Handle 0 as a valid value
+                logger.warning(f"No data found at path '{path}' in response")
+                return None
+        return result
+
+    def _write_items_to_file(self, items: List[Dict[str, Any]], output_file: str, file_mode: str) -> None:
+        """
+        Write items to JSONL file.
+
+        Args:
+            items (List[Dict[str, Any]]): Items to write
+            output_file (str): Path to output file
+            file_mode (str): File mode (w/a)
+        """
+        try:
+            with open(output_file, file_mode) as f:
+                for item in items:
+                    f.write(json.dumps(item) + "\n")
+        except IOError as e:
+            logger.error(f"Error writing to file {output_file}: {str(e)}")
+
+    def _process_offset_pagination(
+        self, current_params: Dict[str, Any], items: List[Dict[str, Any]]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process offset-based pagination.
+
+        Args:
+            current_params (Dict[str, Any]): Current parameters
+            items (List[Dict[str, Any]]): Current items
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        current_params["offset"] += self.page_size
+        # Auto-detect if we've reached the end
+        has_more = len(items) == self.page_size
+        return has_more, current_params
+
+    def _process_page_pagination(
+        self, current_params: Dict[str, Any], items: List[Dict[str, Any]]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process page-based pagination.
+
+        Args:
+            current_params (Dict[str, Any]): Current parameters
+            items (List[Dict[str, Any]]): Current items
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        current_params["page"] += 1
+        # Auto-detect if we've reached the end
+        has_more = len(items) == self.page_size
+        return has_more, current_params
+
+    def _process_token_pagination(
+        self, result: Dict[str, Any], current_params: Dict[str, Any]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process token-based pagination.
+
+        Args:
+            result (Dict[str, Any]): Current response
+            current_params (Dict[str, Any]): Current parameters
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        next_token = self._extract_next_token(result)
+        if next_token:
+            current_params["next_token"] = next_token
+            return True, current_params
+        return False, current_params
+
+    def _process_cursor_pagination(
+        self, result: Dict[str, Any], current_params: Dict[str, Any]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process cursor-based pagination.
+
+        Args:
+            result (Dict[str, Any]): Current response
+            current_params (Dict[str, Any]): Current parameters
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        cursor = self._extract_cursor(result)
+        if cursor:
+            current_params["cursor"] = cursor
+            return True, current_params
+        return False, current_params
+
+    def _process_custom_pagination(
+        self,
+        result: Dict[str, Any],
+        current_params: Dict[str, Any],
+        next_page_extractor: Callable[[Dict[str, Any]], Optional[str]],
+    ) -> Tuple[bool, str, Dict[str, Any]]:
+        """Process custom pagination using extractor function.
+
+        Args:
+            result (Dict[str, Any]): Current response
+            current_params (Dict[str, Any]): Current parameters
+            next_page_extractor (Callable): Function to extract next page
+
+        Returns:
+            Tuple[bool, str, Dict[str, Any]]: (has_more, url, updated_params)
+        """
+        next_page = next_page_extractor(result)
+        if not next_page:
+            return False, "", current_params
+
+        # Validate URL if it's a full URL
+        if next_page.startswith(HTTPS_PREFIX):
+            return True, next_page, current_params
+        elif next_page.startswith(HTTP_PATTERN) and not self.ssl_verify:
+            # Only allow HTTP URLs when SSL verification is disabled
+            logger.warning("Using insecure HTTP URL for pagination")
+            return True, next_page, current_params
+        else:
+            # Just a token or path
+            current_params["next"] = next_page
+            return True, "", current_params
+
+    def _process_next_page(
+        self,
+        pagination_type: str,
+        result: Dict[str, Any],
+        current_params: Dict[str, Any],
+        items: List[Dict[str, Any]],
+        next_page_extractor: Optional[Callable[[Dict[str, Any]], Optional[str]]] = None,
+    ) -> Tuple[bool, str, Dict[str, Any]]:
+        """
+        Process pagination for the next page.
+
+        Args:
+            pagination_type (str): Type of pagination
+            result (Dict[str, Any]): Current page result
+            current_params (Dict[str, Any]): Current parameters
+            items (List[Dict[str, Any]]): Current page items
+            next_page_extractor (Optional[Callable]): Custom extractor function
+
+        Returns:
+            Tuple[bool, str, Dict[str, Any]]: (has_more, url, updated_params)
+        """
+        url = ""  # Default empty URL (no change)
+
+        if pagination_type == "offset":
+            has_more, current_params = self._process_offset_pagination(current_params, items)
+        elif pagination_type == "page":
+            has_more, current_params = self._process_page_pagination(current_params, items)
+        elif pagination_type == "token":
+            has_more, current_params = self._process_token_pagination(result, current_params)
+        elif pagination_type == "cursor":
+            has_more, current_params = self._process_cursor_pagination(result, current_params)
+        elif pagination_type == "custom" and next_page_extractor:
+            has_more, url, current_params = self._process_custom_pagination(result, current_params, next_page_extractor)
+        else:
+            # Default - no more pages
+            has_more = False
+
+        return has_more, url, current_params
+
+    def _setup_pagination(
+        self, endpoint: str, pagination_type: str, params: Optional[Dict[str, Any]]
+    ) -> Tuple[str, Dict[str, Any]]:
+        """
+        Setup initial pagination state.
+
+        Args:
+            endpoint (str): API endpoint
+            pagination_type (str): Type of pagination
+            params (Optional[Dict[str, Any]]): Query parameters
+
+        Returns:
+            Tuple[str, Dict[str, Any]]: (url, current_params)
+        """
+        # Validate pagination type
+        if pagination_type not in ALLOWED_PAGINATION_TYPES:
+            raise ValidationException(f"Invalid pagination type: {pagination_type}")
+
+        # Build full URL and prepare parameters
+        url = f"{self.base_url}/{endpoint.lstrip('/')}"
+        current_params = self._prepare_pagination_params(pagination_type, params)
+
+        return url, current_params
+
+    def _process_result_page(
+        self, result: Dict[str, Any], data_path: Optional[str], output_mode: Optional[str]
+    ) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str], int]:
+        """
+        Process a page of results.
+
+        Args:
+            result (Dict[str, Any]): API response
+            data_path (Optional[str]): Path to data
+            output_mode (Optional[str]): File mode for output
+
+        Returns:
+            Tuple[Optional[List[Dict[str, Any]]], Optional[str], int]:
+                (items, new_output_mode, item_count)
+        """
+        items = self._extract_data(result, data_path)
+        if not items:
+            return None, output_mode, 0
+
+        # Process items - either write to file or prepare to yield
+        if self.output_file and output_mode:
+            self._write_items_to_file(items, self.output_file, output_mode)
+            # Use append mode for subsequent pages
+            return items, APPEND_MODE, len(items)
+
+        # For streaming mode, items will be yielded by caller
+        return items, output_mode, len(items)
+
+    def _fetch_next_page(
+        self,
+        url: str,
+        current_params: Dict[str, Any],
+        request_method: str,
+        post_data: Optional[Dict[str, Any]] = None,
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Fetch the next page of results.
+
+        Args:
+            url (str): URL to request
+            current_params (Dict[str, Any]): Current parameters
+            request_method (str): HTTP method
+            post_data (Optional[Dict[str, Any]]): Data for POST requests
+
+        Returns:
+            Optional[Dict[str, Any]]: The API response or None on error
+        """
+        return self._make_request(url, current_params, request_method, post_data)
+
+    def _yield_items(self, items: List[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
+        """
+        Yield items to the caller in streaming mode.
+
+        Args:
+            items (List[Dict[str, Any]]): Items to yield
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator of items
+        """
+        for item in items:
+            yield item
+
+    def _should_continue_pagination(
+        self,
+        has_more: bool,
+        page_count: int,
+    ) -> bool:
+        """
+        Determine if pagination should continue.
+
+        Args:
+            has_more (bool): Whether there are more results
+            page_count (int): Current page count
+
+        Returns:
+            bool: True if pagination should continue
+        """
+        return has_more and (self.max_pages is None or page_count < self.max_pages)
+
+    def _log_pagination_progress(
+        self,
+        page_count: int,
+        item_count: int,
+        url: Optional[str] = None,
+    ) -> None:
+        """
+        Log pagination progress.
+
+        Args:
+            page_count (int): Current page count
+            item_count (int): Item count for current page
+            url (Optional[str]): URL for current request, optional for final log
+        """
+        if url:
+            logger.debug(f"Fetching page {page_count + 1} from {url}")
+        if item_count > 0:
+            logger.debug(f"Processed page {page_count} with {item_count} items")
+
+    def _log_pagination_complete(
+        self,
+        total_items: int,
+        page_count: int,
+    ) -> None:
+        """
+        Log completion of pagination.
+
+        Args:
+            total_items (int): Total number of items fetched
+            page_count (int): Total number of pages
+        """
+        logger.info(f"Completed pagination: {total_items} items in {page_count} pages")
+
+    def _process_single_page(
+        self,
+        url: str,
+        current_params: Dict[str, Any],
+        request_method: str,
+        post_data: Optional[Dict[str, Any]],
+        data_path: Optional[str],
+        pagination_type: str,
+        output_mode: Optional[str],
+        page_count: int,
+        next_page_extractor: Optional[Callable[[Dict[str, Any]], Optional[str]]] = None,
+    ) -> Tuple[bool, str, Dict[str, Any], Optional[List[Dict[str, Any]]], Optional[str], int]:
+        """
+        Process a single page of API results.
+
+        Args:
+            url (str): Current API URL
+            current_params (Dict[str, Any]): Current request parameters
+            request_method (str): HTTP method to use
+            post_data (Optional[Dict[str, Any]]): Data for POST requests
+            data_path (Optional[str]): Path to data in response
+            pagination_type (str): Type of pagination
+            output_mode (Optional[str]): Current file output mode
+            page_count (int): Current page counter
+            next_page_extractor (Optional[Callable]): Function to extract next page
+
+        Returns:
+            Tuple containing:
+                bool: Whether there are more pages
+                str: Next URL if applicable
+                Dict[str, Any]: Updated parameters
+                Optional[List[Dict[str, Any]]]: Items from this page
+                Optional[str]: Updated output mode
+                int: Number of items processed
+        """
+        # Log beginning of page fetch
+        self._log_pagination_progress(page_count, 0, url)
+
+        # Apply throttling if needed
+        self._apply_throttling(page_count)
+
+        # Fetch the page
+        result = self._fetch_next_page(url, current_params, request_method, post_data)
+        if not result:
+            return False, url, current_params, None, output_mode, 0
+
+        # Process the results
+        items, new_output_mode, item_count = self._process_result_page(result, data_path, output_mode)
+        if not items:
+            return False, url, current_params, None, output_mode, 0
+
+        # Update pagination for next page
+        has_more, next_url, updated_params = self._process_next_page(
+            pagination_type, result, current_params, items, next_page_extractor
+        )
+
+        # Log page processed
+        self._log_pagination_progress(page_count, item_count, url)
+
+        return has_more, next_url, updated_params, items, new_output_mode, item_count
+
+    def fetch_paginated_results(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        data_path: Optional[str] = None,
+        pagination_type: str = "offset",
+        next_page_extractor: Optional[Callable[[Dict[str, Any]], Optional[str]]] = None,
+        request_method: str = "GET",
+        post_data: Optional[Dict[str, Any]] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch all pages of results from the API endpoint.
+
+        Args:
+            endpoint (str): API endpoint path (will be appended to base_url)
+            params (Optional[Dict[str, Any]]): Query parameters for the request
+            data_path (Optional[str]): JSON path to the data array within the response
+            pagination_type (str): Type of pagination: "offset", "page", "token", or "cursor"
+            next_page_extractor (Optional[Callable]): Function to extract next page URL/token
+            request_method (str): HTTP method to use ("GET" or "POST")
+            post_data (Optional[Dict[str, Any]]): JSON data to send with POST requests
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator yielding each result item
+
+        Raises:
+            ValidationException: If an invalid pagination type is provided
+        """
+        # Setup initial pagination state
+        url, current_params = self._setup_pagination(endpoint, pagination_type, params)
+        current_post_data = post_data.copy() if post_data else {}
+        page_count = 0
+        total_items = 0
+        has_more = True
+
+        # Use file or memory for storing results
+        output_mode = WRITE_MODE if self.output_file else None
+
+        # Main pagination loop
+        while self._should_continue_pagination(has_more, page_count):
+            # Process a single page
+            has_more, next_url, current_params, items, output_mode, item_count = self._process_single_page(
+                url=url,
+                current_params=current_params,
+                request_method=request_method,
+                post_data=current_post_data,
+                data_path=data_path,
+                pagination_type=pagination_type,
+                output_mode=output_mode,
+                page_count=page_count,
+                next_page_extractor=next_page_extractor,
+            )
+
+            # If no items processed, we're done
+            if not items or item_count == 0:
+                break
+
+            # In streaming mode, yield items directly
+            if not self.output_file:
+                for item in items:
+                    yield item
+
+            # Update URL if changed
+            if next_url:
+                url = next_url
+
+            # Update counters
+            total_items += item_count
+            page_count += 1
+
+        # Log completion
+        self._log_pagination_complete(total_items, page_count)
+
+        # If writing to file, read back as iterator
+        if self.output_file:
+            yield from self.read_jsonl_file(self.output_file)
+
+    def _create_endpoint_fetch_task(
+        self, endpoint: str, params: Optional[Dict[str, Any]], data_path: Optional[str], request_method: str
+    ) -> Callable[[], List[Dict[str, Any]]]:
+        """
+        Create a callable task for fetching a single endpoint.
+
+        Args:
+            endpoint (str): API endpoint
+            params (Optional[Dict[str, Any]]): Query parameters
+            data_path (Optional[str]): Path to data
+            request_method (str): HTTP method
+
+        Returns:
+            Callable[[], List[Dict[str, Any]]]: Task function
+        """
+
+        def task() -> List[Dict[str, Any]]:
+            results = []
+            for item in self.fetch_paginated_results(
+                endpoint=endpoint,
+                params=params,
+                data_path=data_path,
+                request_method=request_method,
+            ):
+                results.append(item)
+            return results
+
+        return task
+
+    def _process_concurrent_results(self, futures: List, use_output_file: Optional[str]) -> Iterator[Dict[str, Any]]:
+        """
+        Process results from concurrent endpoint fetches.
+
+        Args:
+            futures (List): List of Future objects
+            use_output_file (Optional[str]): Output file path
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator of results
+        """
+        file_mode = WRITE_MODE if use_output_file else None
+
+        for future in futures:
+            try:
+                results = future.result()
+                if not results:
+                    continue
+
+                if use_output_file:
+                    self._write_items_to_file(results, use_output_file, file_mode)
+                    # Use append mode for subsequent endpoints
+                    file_mode = APPEND_MODE
+                else:
+                    for item in results:
+                        yield item
+            except Exception as e:
+                logger.error(f"Error in concurrent fetch: {str(e)}")
+
+    def fetch_all_concurrent(
+        self,
+        endpoints: List[str],
+        params: Optional[Dict[str, Any]] = None,
+        data_path: Optional[str] = None,
+        request_method: str = "GET",
+        output_file: Optional[str] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch multiple endpoints concurrently and combine results.
+
+        Args:
+            endpoints (List[str]): List of API endpoint paths
+            params (Optional[Dict[str, Any]]): Query parameters for the requests
+            data_path (Optional[str]): JSON path to the data array within the response
+            request_method (str): HTTP method to use ("GET" or "POST")
+            output_file (Optional[str]): Override the instance output_file
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator yielding each result item
+        """
+        use_output_file = output_file or self.output_file
+
+        # Create tasks for each endpoint
+        tasks = [
+            self._create_endpoint_fetch_task(endpoint, params, data_path, request_method) for endpoint in endpoints
+        ]
+
+        # Execute tasks concurrently
+        with ThreadPoolExecutor(max_workers=self.concurrent_requests) as executor:
+            # Start all fetch tasks
+            futures = [executor.submit(task) for task in tasks]
+
+            # Process results as they complete
+            yield from self._process_concurrent_results(futures, use_output_file)
+
+        # If we're writing to a file, read it back as an iterator
+        if use_output_file:
+            yield from self.read_jsonl_file(use_output_file)
+
+    def _extract_next_token(self, response: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract the next token from a response.
+
+        This method tries several common patterns for next token references.
+
+        Args:
+            response (Dict[str, Any]): The API response
+
+        Returns:
+            Optional[str]: The next token or None if not found
+        """
+        # Try common patterns for next token
+        token_paths = [
+            ["nextToken"],
+            ["next_token"],
+            ["pagination", "nextToken"],
+            ["meta", "next_token"],
+            ["paging", "next"],
+            ["links", "next"],
+        ]
+
+        return self._extract_value_from_paths(response, token_paths)
+
+    def _extract_cursor(self, response: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract the cursor from a response.
+
+        This method tries several common patterns for cursor references.
+
+        Args:
+            response (Dict[str, Any]): The API response
+
+        Returns:
+            Optional[str]: The cursor or None if not found
+        """
+        # Try common patterns for cursor
+        cursor_paths = [
+            ["cursor"],
+            ["page", "cursor"],
+            ["meta", "cursor"],
+            ["paging", "cursors", "after"],
+            ["pagination", "cursor"],
+        ]
+
+        return self._extract_value_from_paths(response, cursor_paths)
+
+    def _extract_value_from_path(self, data: Dict[str, Any], path: List[str]) -> Optional[str]:
+        """
+        Extract a value from a nested dictionary using a single path.
+
+        Args:
+            data (Dict[str, Any]): The dictionary to search
+            path (List[str]): Path to the value
+
+        Returns:
+            Optional[str]: The found value or None
+        """
+        value = data
+        try:
+            for key in path:
+                if key in value:
+                    value = value[key]
+                else:
+                    return None
+            if value and isinstance(value, (str, int)):
+                return str(value)
+        except (KeyError, TypeError):
+            pass
+        return None
+
+    def _extract_value_from_paths(self, data: Dict[str, Any], paths: List[List[str]]) -> Optional[str]:
+        """
+        Extract a value from a nested dictionary using multiple possible paths.
+
+        Args:
+            data (Dict[str, Any]): The dictionary to search
+            paths (List[List[str]]): List of possible path lists to the value
+
+        Returns:
+            Optional[str]: The found value or None
+        """
+        for path in paths:
+            value = self._extract_value_from_path(data, path)
+            if value:
+                return value
+        return None
+
+    @staticmethod
+    def _parse_jsonl_line(line: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse a single JSONL line.
+
+        Args:
+            line (str): Line to parse
+
+        Returns:
+            Optional[Dict[str, Any]]: Parsed JSON or None on error
+        """
+        line = line.strip()
+        if not line:  # Skip empty lines
+            return None
+
+        try:
+            return json.loads(line)
+        except json.JSONDecodeError as e:
+            logger.error(f"Error parsing JSON line: {str(e)}")
+            logger.debug(f"Problematic line: {line}")
+            return None
+
+    @staticmethod
+    def read_jsonl_file(file_path: str) -> Iterator[Dict[str, Any]]:
+        """
+        Read a JSONL file and yield each line as a parsed JSON object.
+
+        Args:
+            file_path (str): Path to the JSONL file
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator of parsed JSON objects
+        """
+        try:
+            with open(file_path, "r") as f:
+                for line in f:
+                    parsed = ApiPaginator._parse_jsonl_line(line)
+                    if parsed:
+                        yield parsed
+        except FileNotFoundError:
+            logger.error(f"File not found: {file_path}")
+        except IOError as e:
+            logger.error(f"IO error reading file {file_path}: {str(e)}")
+
+    def get_output_file_path(self) -> Optional[str]:
+        """
+        Get the path to the output file.
+
+        Returns:
+            Optional[str]: Path to the output file or None if not set
+        """
+        return self.output_file
+
+    def clear_output_file(self) -> None:
+        """
+        Clear the output file if it exists.
+        """
+        if not self.output_file:
+            return
+
+        if os.path.exists(self.output_file):
+            try:
+                os.remove(self.output_file)
+                logger.debug(f"Cleared output file: {self.output_file}")
+            except OSError as e:
+                logger.error(f"Error clearing output file {self.output_file}: {str(e)}")