regscale-cli 6.18.0.0-py3-none-any.whl → 6.19.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of regscale-cli might be problematic.

Files changed (45)
  1. regscale/__init__.py +1 -1
  2. regscale/integrations/api_paginator.py +932 -0
  3. regscale/integrations/api_paginator_example.py +348 -0
  4. regscale/integrations/commercial/__init__.py +11 -10
  5. regscale/integrations/commercial/{qualys.py → qualys/__init__.py} +756 -105
  6. regscale/integrations/commercial/qualys/scanner.py +1051 -0
  7. regscale/integrations/commercial/qualys/variables.py +21 -0
  8. regscale/integrations/commercial/sicura/api.py +1 -0
  9. regscale/integrations/commercial/stigv2/click_commands.py +36 -8
  10. regscale/integrations/commercial/stigv2/stig_integration.py +63 -9
  11. regscale/integrations/commercial/tenablev2/__init__.py +9 -0
  12. regscale/integrations/commercial/tenablev2/authenticate.py +23 -2
  13. regscale/integrations/commercial/tenablev2/commands.py +779 -0
  14. regscale/integrations/commercial/tenablev2/jsonl_scanner.py +1999 -0
  15. regscale/integrations/commercial/tenablev2/sc_scanner.py +600 -0
  16. regscale/integrations/commercial/tenablev2/scanner.py +7 -5
  17. regscale/integrations/commercial/tenablev2/utils.py +21 -4
  18. regscale/integrations/commercial/tenablev2/variables.py +4 -0
  19. regscale/integrations/jsonl_scanner_integration.py +523 -142
  20. regscale/integrations/scanner_integration.py +102 -26
  21. regscale/integrations/transformer/__init__.py +17 -0
  22. regscale/integrations/transformer/data_transformer.py +445 -0
  23. regscale/integrations/transformer/mappings/__init__.py +8 -0
  24. regscale/integrations/variables.py +2 -0
  25. regscale/models/__init__.py +5 -2
  26. regscale/models/integration_models/cisa_kev_data.json +5 -5
  27. regscale/models/integration_models/synqly_models/capabilities.json +1 -1
  28. regscale/models/regscale_models/asset.py +5 -2
  29. regscale/models/regscale_models/file.py +5 -2
  30. regscale/regscale.py +3 -1
  31. {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/METADATA +1 -1
  32. {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/RECORD +44 -28
  33. tests/regscale/core/test_version.py +22 -0
  34. tests/regscale/integrations/__init__.py +0 -0
  35. tests/regscale/integrations/test_api_paginator.py +597 -0
  36. tests/regscale/integrations/test_integration_mapping.py +60 -0
  37. tests/regscale/integrations/test_issue_creation.py +317 -0
  38. tests/regscale/integrations/test_issue_due_date.py +46 -0
  39. tests/regscale/integrations/transformer/__init__.py +0 -0
  40. tests/regscale/integrations/transformer/test_data_transformer.py +850 -0
  41. regscale/integrations/commercial/tenablev2/click.py +0 -1641
  42. {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/LICENSE +0 -0
  43. {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/WHEEL +0 -0
  44. {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/entry_points.txt +0 -0
  45. {regscale_cli-6.18.0.0.dist-info → regscale_cli-6.19.0.0.dist-info}/top_level.txt +0 -0
regscale/integrations/api_paginator.py
@@ -0,0 +1,932 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+API Paginator for RegScale integrations.
+
+This class provides a reusable way to fetch paginated API responses and optionally
+write results to JSONL files for processing by scanner integrations.
+"""
+
+import json
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from regscale.exceptions import ValidationException
+
+logger = logging.getLogger("regscale")
+
+# Constants for common patterns and protocols
+HTTPS_PREFIX = "https://"  # NOSONAR
+HTTP_PATTERN = "http://"  # NOSONAR
+ALLOWED_PAGINATION_TYPES = ["offset", "page", "token", "cursor", "custom"]
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+ALLOWED_HTTP_METHODS = ["GET", "POST"]
+DEFAULT_PAGE_SIZE = 100
+WRITE_MODE = "w"
+APPEND_MODE = "a"
+
+
+class ApiPaginator:
+    """
+    A utility class to handle API pagination and write results to a JSONL file.
+
+    This class is designed to work with RESTful APIs that use common pagination patterns.
+    It can retrieve all pages of results and optionally write them to a file for further processing.
+
+    Supports various pagination methods:
+    - Offset/limit pagination
+    - Page/per_page pagination
+    - Token-based pagination
+    - Cursor-based pagination
+
+    Also includes error handling, rate limiting, and concurrent requests.
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        auth_headers: Dict[str, str],
+        output_file: Optional[str] = None,
+        page_size: int = DEFAULT_PAGE_SIZE,
+        max_pages: Optional[int] = None,
+        timeout: int = 30,
+        retry_attempts: int = 3,
+        retry_backoff_factor: float = 0.5,
+        throttle_rate: Optional[float] = None,
+        concurrent_requests: int = 1,
+        ssl_verify: bool = True,
+    ):
+        """
+        Initialize the API Paginator.
+
+        Args:
+            base_url (str): The base URL for the API
+            auth_headers (Dict[str, str]): Authentication headers for the API
+            output_file (Optional[str]): Path to write results to (JSONL format)
+            page_size (int): Number of items per page to request
+            max_pages (Optional[int]): Maximum number of pages to retrieve (None for all)
+            timeout (int): Request timeout in seconds
+            retry_attempts (int): Number of times to retry failed requests
+            retry_backoff_factor (float): Backoff factor for retries
+            throttle_rate (Optional[float]): Seconds to wait between requests (rate limiting)
+            concurrent_requests (int): Number of concurrent requests to make
+            ssl_verify (bool): Whether to verify SSL certificates
+        """
+        self.base_url = base_url.rstrip("/")
+        self.auth_headers = auth_headers
+        self.output_file = output_file
+        self.page_size = page_size
+        self.max_pages = max_pages
+        self.timeout = timeout
+        self.retry_attempts = retry_attempts
+        self.retry_backoff_factor = retry_backoff_factor
+        self.throttle_rate = throttle_rate
+        self.concurrent_requests = max(1, concurrent_requests)
+        self.ssl_verify = ssl_verify
+
+        # Initialize session with retry capability
+        self.session = self._create_session()
+
+        # Ensure output directory exists if file is specified
+        self._ensure_output_dir_exists()
+
+    def _ensure_output_dir_exists(self) -> None:
+        """Ensure the output directory exists if output file is specified."""
+        if self.output_file:
+            output_dir = os.path.dirname(os.path.abspath(self.output_file))
+            os.makedirs(output_dir, exist_ok=True)
+
+    def _create_session(self) -> requests.Session:
+        """
+        Create a requests session with retry capability.
+
+        Returns:
+            requests.Session: Configured session object
+        """
+        session = requests.Session()
+
+        # Configure retry strategy
+        retry_strategy = Retry(
+            total=self.retry_attempts,
+            backoff_factor=self.retry_backoff_factor,
+            status_forcelist=RETRY_STATUS_CODES,
+            allowed_methods=ALLOWED_HTTP_METHODS,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount(HTTPS_PREFIX, adapter)
+
+        # Only mount HTTP adapter if SSL verification is disabled (for internal/development use)
+        if not self.ssl_verify:
+            session.mount(HTTP_PATTERN, adapter)
+            logger.warning(
+                "HTTP protocol enabled due to disabled SSL verification. Not recommended for production use."
+            )
+
+        # Add default headers
+        session.headers.update(self.auth_headers)
+
+        return session
+
+    def _prepare_pagination_params(
+        self, pagination_type: str, params: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        Prepare pagination parameters based on pagination type.
+
+        Args:
+            pagination_type (str): Type of pagination
+            params (Optional[Dict[str, Any]]): Existing parameters
+
+        Returns:
+            Dict[str, Any]: Updated parameters
+        """
+        current_params = params.copy() if params else {}
+
+        if pagination_type == "offset":
+            current_params["limit"] = self.page_size
+            current_params["offset"] = 0
+        elif pagination_type == "page":
+            current_params["per_page"] = self.page_size
+            current_params["page"] = 1
+
+        return current_params
+
+    def _apply_throttling(self, page_count: int) -> None:
+        """Apply throttling between requests if configured.
+
+        Args:
+            page_count (int): Current page count
+        """
+        if self.throttle_rate and page_count > 0:
+            time.sleep(self.throttle_rate)
+
+    def _make_request(
+        self, url: str, params: Dict[str, Any], request_method: str, post_data: Optional[Dict[str, Any]] = None
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Make an HTTP request and handle errors.
+
+        Args:
+            url (str): URL to request
+            params (Dict[str, Any]): Query parameters
+            request_method (str): HTTP method (GET/POST)
+            post_data (Optional[Dict[str, Any]]): Data for POST requests
+
+        Returns:
+            Optional[Dict[str, Any]]: Response data or None on error
+        """
+        response = None
+        try:
+            if request_method.upper() == "GET":
+                response = self.session.get(url, params=params, timeout=self.timeout, verify=self.ssl_verify)
+            else:  # POST
+                response = self.session.post(
+                    url, params=params, json=post_data, timeout=self.timeout, verify=self.ssl_verify
+                )
+
+            response.raise_for_status()
+            return response.json()
+        except requests.RequestException as e:
+            logger.error(f"Request error: {str(e)}")
+            logger.debug(f"Response: {response.text if hasattr(response, 'text') else 'No response text'}")
+            return None
+        except ValueError as e:
+            logger.error(f"JSON parsing error: {str(e)}")
+            return None
+
+    def _extract_data(self, result: Dict[str, Any], data_path: Optional[str]) -> Optional[List[Dict[str, Any]]]:
+        """
+        Extract data from API response based on data path.
+
+        Args:
+            result (Dict[str, Any]): API response
+            data_path (Optional[str]): Path to data in response
+
+        Returns:
+            Optional[List[Dict[str, Any]]]: Extracted data items or None
+        """
+        if not result:
+            return None
+
+        if data_path:
+            # Navigate the nested structure to find data
+            data = self._navigate_data_path(result, data_path)
+            if not data:
+                return None
+        else:
+            # Use the entire response if no path is specified
+            data = result
+
+        # Convert to list if it's not already
+        return data if isinstance(data, list) else [data]
+
+    def _navigate_data_path(self, data: Dict[str, Any], path: str) -> Any:
+        """
+        Navigate a nested structure using a dot-separated path.
+
+        Args:
+            data (Dict[str, Any]): The data structure to navigate
+            path (str): Dot-separated path
+
+        Returns:
+            Any: The value found, or None if nothing exists at the path
+        """
+        result = data
+        for key in path.split("."):
+            result = result.get(key, {})
+            if not result and result != 0:  # Handle 0 as a valid value
+                logger.warning(f"No data found at path '{path}' in response")
+                return None
+        return result
+
+    def _write_items_to_file(self, items: List[Dict[str, Any]], output_file: str, file_mode: str) -> None:
+        """
+        Write items to JSONL file.
+
+        Args:
+            items (List[Dict[str, Any]]): Items to write
+            output_file (str): Path to output file
+            file_mode (str): File mode (w/a)
+        """
+        try:
+            with open(output_file, file_mode) as f:
+                for item in items:
+                    f.write(json.dumps(item) + "\n")
+        except IOError as e:
+            logger.error(f"Error writing to file {output_file}: {str(e)}")
+
+    def _process_offset_pagination(
+        self, current_params: Dict[str, Any], items: List[Dict[str, Any]]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process offset-based pagination.
+
+        Args:
+            current_params (Dict[str, Any]): Current parameters
+            items (List[Dict[str, Any]]): Current items
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        current_params["offset"] += self.page_size
+        # Auto-detect if we've reached the end
+        has_more = len(items) == self.page_size
+        return has_more, current_params
+
+    def _process_page_pagination(
+        self, current_params: Dict[str, Any], items: List[Dict[str, Any]]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process page-based pagination.
+
+        Args:
+            current_params (Dict[str, Any]): Current parameters
+            items (List[Dict[str, Any]]): Current items
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        current_params["page"] += 1
+        # Auto-detect if we've reached the end
+        has_more = len(items) == self.page_size
+        return has_more, current_params
+
+    def _process_token_pagination(
+        self, result: Dict[str, Any], current_params: Dict[str, Any]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process token-based pagination.
+
+        Args:
+            result (Dict[str, Any]): Current response
+            current_params (Dict[str, Any]): Current parameters
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        next_token = self._extract_next_token(result)
+        if next_token:
+            current_params["next_token"] = next_token
+            return True, current_params
+        return False, current_params
+
+    def _process_cursor_pagination(
+        self, result: Dict[str, Any], current_params: Dict[str, Any]
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Process cursor-based pagination.
+
+        Args:
+            result (Dict[str, Any]): Current response
+            current_params (Dict[str, Any]): Current parameters
+
+        Returns:
+            Tuple[bool, Dict[str, Any]]: (has_more, updated_params)
+        """
+        cursor = self._extract_cursor(result)
+        if cursor:
+            current_params["cursor"] = cursor
+            return True, current_params
+        return False, current_params
+
+    def _process_custom_pagination(
+        self,
+        result: Dict[str, Any],
+        current_params: Dict[str, Any],
+        next_page_extractor: Callable[[Dict[str, Any]], Optional[str]],
+    ) -> Tuple[bool, str, Dict[str, Any]]:
+        """Process custom pagination using extractor function.
+
+        Args:
+            result (Dict[str, Any]): Current response
+            current_params (Dict[str, Any]): Current parameters
+            next_page_extractor (Callable): Function to extract next page
+
+        Returns:
+            Tuple[bool, str, Dict[str, Any]]: (has_more, url, updated_params)
+        """
+        next_page = next_page_extractor(result)
+        if not next_page:
+            return False, "", current_params
+
+        # Validate URL if it's a full URL
+        if next_page.startswith(HTTPS_PREFIX):
+            return True, next_page, current_params
+        elif next_page.startswith(HTTP_PATTERN) and not self.ssl_verify:
+            # Only allow HTTP URLs when SSL verification is disabled
+            logger.warning("Using insecure HTTP URL for pagination")
+            return True, next_page, current_params
+        else:
+            # Just a token or path
+            current_params["next"] = next_page
+            return True, "", current_params
+
+    def _process_next_page(
+        self,
+        pagination_type: str,
+        result: Dict[str, Any],
+        current_params: Dict[str, Any],
+        items: List[Dict[str, Any]],
+        next_page_extractor: Optional[Callable[[Dict[str, Any]], Optional[str]]] = None,
+    ) -> Tuple[bool, str, Dict[str, Any]]:
+        """
+        Process pagination for the next page.
+
+        Args:
+            pagination_type (str): Type of pagination
+            result (Dict[str, Any]): Current page result
+            current_params (Dict[str, Any]): Current parameters
+            items (List[Dict[str, Any]]): Current page items
+            next_page_extractor (Optional[Callable]): Custom extractor function
+
+        Returns:
+            Tuple[bool, str, Dict[str, Any]]: (has_more, url, updated_params)
+        """
+        url = ""  # Default empty URL (no change)
+
+        if pagination_type == "offset":
+            has_more, current_params = self._process_offset_pagination(current_params, items)
+        elif pagination_type == "page":
+            has_more, current_params = self._process_page_pagination(current_params, items)
+        elif pagination_type == "token":
+            has_more, current_params = self._process_token_pagination(result, current_params)
+        elif pagination_type == "cursor":
+            has_more, current_params = self._process_cursor_pagination(result, current_params)
+        elif pagination_type == "custom" and next_page_extractor:
+            has_more, url, current_params = self._process_custom_pagination(result, current_params, next_page_extractor)
+        else:
+            # Default - no more pages
+            has_more = False
+
+        return has_more, url, current_params
+
+    def _setup_pagination(
+        self, endpoint: str, pagination_type: str, params: Optional[Dict[str, Any]]
+    ) -> Tuple[str, Dict[str, Any]]:
+        """
+        Set up the initial pagination state.
+
+        Args:
+            endpoint (str): API endpoint
+            pagination_type (str): Type of pagination
+            params (Optional[Dict[str, Any]]): Query parameters
+
+        Returns:
+            Tuple[str, Dict[str, Any]]: (url, current_params)
+        """
+        # Validate pagination type
+        if pagination_type not in ALLOWED_PAGINATION_TYPES:
+            raise ValidationException(f"Invalid pagination type: {pagination_type}")
+
+        # Build full URL and prepare parameters
+        url = f"{self.base_url}/{endpoint.lstrip('/')}"
+        current_params = self._prepare_pagination_params(pagination_type, params)
+
+        return url, current_params
+
+    def _process_result_page(
+        self, result: Dict[str, Any], data_path: Optional[str], output_mode: Optional[str]
+    ) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str], int]:
+        """
+        Process a page of results.
+
+        Args:
+            result (Dict[str, Any]): API response
+            data_path (Optional[str]): Path to data
+            output_mode (Optional[str]): File mode for output
+
+        Returns:
+            Tuple[Optional[List[Dict[str, Any]]], Optional[str], int]:
+                (items, new_output_mode, item_count)
+        """
+        items = self._extract_data(result, data_path)
+        if not items:
+            return None, output_mode, 0
+
+        # Process items - either write to file or prepare to yield
+        if self.output_file and output_mode:
+            self._write_items_to_file(items, self.output_file, output_mode)
+            # Use append mode for subsequent pages
+            return items, APPEND_MODE, len(items)
+
+        # For streaming mode, items will be yielded by caller
+        return items, output_mode, len(items)
+
+    def _fetch_next_page(
+        self,
+        url: str,
+        current_params: Dict[str, Any],
+        request_method: str,
+        post_data: Optional[Dict[str, Any]] = None,
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Fetch the next page of results.
+
+        Args:
+            url (str): URL to request
+            current_params (Dict[str, Any]): Current parameters
+            request_method (str): HTTP method
+            post_data (Optional[Dict[str, Any]]): Data for POST requests
+
+        Returns:
+            Optional[Dict[str, Any]]: The API response or None on error
+        """
+        return self._make_request(url, current_params, request_method, post_data)
+
+    def _yield_items(self, items: List[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
+        """
+        Yield items to the caller in streaming mode.
+
+        Args:
+            items (List[Dict[str, Any]]): Items to yield
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator of items
+        """
+        for item in items:
+            yield item
+
+    def _should_continue_pagination(
+        self,
+        has_more: bool,
+        page_count: int,
+    ) -> bool:
+        """
+        Determine if pagination should continue.
+
+        Args:
+            has_more (bool): Whether there are more results
+            page_count (int): Current page count
+
+        Returns:
+            bool: True if pagination should continue
+        """
+        return has_more and (self.max_pages is None or page_count < self.max_pages)
+
+    def _log_pagination_progress(
+        self,
+        page_count: int,
+        item_count: int,
+        url: Optional[str] = None,
+    ) -> None:
+        """
+        Log pagination progress.
+
+        Args:
+            page_count (int): Current page count
+            item_count (int): Item count for current page
+            url (Optional[str]): URL for current request, optional for final log
+        """
+        if url:
+            logger.debug(f"Fetching page {page_count + 1} from {url}")
+        if item_count > 0:
+            logger.debug(f"Processed page {page_count} with {item_count} items")
+
+    def _log_pagination_complete(
+        self,
+        total_items: int,
+        page_count: int,
+    ) -> None:
+        """
+        Log completion of pagination.
+
+        Args:
+            total_items (int): Total number of items fetched
+            page_count (int): Total number of pages
+        """
+        logger.info(f"Completed pagination: {total_items} items in {page_count} pages")
+
+    def _process_single_page(
+        self,
+        url: str,
+        current_params: Dict[str, Any],
+        request_method: str,
+        post_data: Optional[Dict[str, Any]],
+        data_path: Optional[str],
+        pagination_type: str,
+        output_mode: Optional[str],
+        page_count: int,
+        next_page_extractor: Optional[Callable[[Dict[str, Any]], Optional[str]]] = None,
+    ) -> Tuple[bool, str, Dict[str, Any], Optional[List[Dict[str, Any]]], Optional[str], int]:
+        """
+        Process a single page of API results.
+
+        Args:
+            url (str): Current API URL
+            current_params (Dict[str, Any]): Current request parameters
+            request_method (str): HTTP method to use
+            post_data (Optional[Dict[str, Any]]): Data for POST requests
+            data_path (Optional[str]): Path to data in response
+            pagination_type (str): Type of pagination
+            output_mode (Optional[str]): Current file output mode
+            page_count (int): Current page counter
+            next_page_extractor (Optional[Callable]): Function to extract next page
+
+        Returns:
+            Tuple containing:
+                bool: Whether there are more pages
+                str: Next URL if applicable
+                Dict[str, Any]: Updated parameters
+                Optional[List[Dict[str, Any]]]: Items from this page
+                Optional[str]: Updated output mode
+                int: Number of items processed
+        """
+        # Log beginning of page fetch
+        self._log_pagination_progress(page_count, 0, url)
+
+        # Apply throttling if needed
+        self._apply_throttling(page_count)
+
+        # Fetch the page
+        result = self._fetch_next_page(url, current_params, request_method, post_data)
+        if not result:
+            return False, url, current_params, None, output_mode, 0
+
+        # Process the results
+        items, new_output_mode, item_count = self._process_result_page(result, data_path, output_mode)
+        if not items:
+            return False, url, current_params, None, output_mode, 0
+
+        # Update pagination for next page
+        has_more, next_url, updated_params = self._process_next_page(
+            pagination_type, result, current_params, items, next_page_extractor
+        )
+
+        # Log page processed
+        self._log_pagination_progress(page_count, item_count, url)
+
+        return has_more, next_url, updated_params, items, new_output_mode, item_count
+
+    def fetch_paginated_results(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        data_path: Optional[str] = None,
+        pagination_type: str = "offset",
+        next_page_extractor: Optional[Callable[[Dict[str, Any]], Optional[str]]] = None,
+        request_method: str = "GET",
+        post_data: Optional[Dict[str, Any]] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch all pages of results from the API endpoint.
+
+        Args:
+            endpoint (str): API endpoint path (will be appended to base_url)
+            params (Optional[Dict[str, Any]]): Query parameters for the request
+            data_path (Optional[str]): JSON path to the data array within the response
+            pagination_type (str): Type of pagination: "offset", "page", "token", "cursor", or "custom"
+            next_page_extractor (Optional[Callable]): Function to extract next page URL/token
+            request_method (str): HTTP method to use ("GET" or "POST")
+            post_data (Optional[Dict[str, Any]]): JSON data to send with POST requests
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator yielding each result item
+
+        Raises:
+            ValidationException: If an invalid pagination type is provided
+        """
+        # Set up initial pagination state
+        url, current_params = self._setup_pagination(endpoint, pagination_type, params)
+        current_post_data = post_data.copy() if post_data else {}
+        page_count = 0
+        total_items = 0
+        has_more = True
+
+        # Use file or memory for storing results
+        output_mode = WRITE_MODE if self.output_file else None
+
+        # Main pagination loop
+        while self._should_continue_pagination(has_more, page_count):
+            # Process a single page
+            has_more, next_url, current_params, items, output_mode, item_count = self._process_single_page(
+                url=url,
+                current_params=current_params,
+                request_method=request_method,
+                post_data=current_post_data,
+                data_path=data_path,
+                pagination_type=pagination_type,
+                output_mode=output_mode,
+                page_count=page_count,
+                next_page_extractor=next_page_extractor,
+            )
+
+            # If no items processed, we're done
+            if not items or item_count == 0:
+                break
+
+            # In streaming mode, yield items directly
+            if not self.output_file:
+                for item in items:
+                    yield item
+
+            # Update URL if changed
+            if next_url:
+                url = next_url
+
+            # Update counters
+            total_items += item_count
+            page_count += 1
+
+        # Log completion
+        self._log_pagination_complete(total_items, page_count)
+
+        # If writing to file, read back as iterator
+        if self.output_file:
+            yield from self.read_jsonl_file(self.output_file)
+
+    def _create_endpoint_fetch_task(
+        self, endpoint: str, params: Optional[Dict[str, Any]], data_path: Optional[str], request_method: str
+    ) -> Callable[[], List[Dict[str, Any]]]:
+        """
+        Create a callable task for fetching a single endpoint.
+
+        Args:
+            endpoint (str): API endpoint
+            params (Optional[Dict[str, Any]]): Query parameters
+            data_path (Optional[str]): Path to data
+            request_method (str): HTTP method
+
+        Returns:
+            Callable[[], List[Dict[str, Any]]]: Task function
+        """
+
+        def task() -> List[Dict[str, Any]]:
+            results = []
+            for item in self.fetch_paginated_results(
+                endpoint=endpoint,
+                params=params,
+                data_path=data_path,
+                request_method=request_method,
+            ):
+                results.append(item)
+            return results
+
+        return task
+
+    def _process_concurrent_results(self, futures: List, use_output_file: Optional[str]) -> Iterator[Dict[str, Any]]:
+        """
+        Process results from concurrent endpoint fetches.
+
+        Args:
+            futures (List): List of Future objects
+            use_output_file (Optional[str]): Output file path
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator of results
+        """
+        file_mode = WRITE_MODE if use_output_file else None
+
+        for future in futures:
+            try:
+                results = future.result()
+                if not results:
+                    continue
+
+                if use_output_file:
+                    self._write_items_to_file(results, use_output_file, file_mode)
+                    # Use append mode for subsequent endpoints
+                    file_mode = APPEND_MODE
+                else:
+                    for item in results:
+                        yield item
+            except Exception as e:
+                logger.error(f"Error in concurrent fetch: {str(e)}")
+
+    def fetch_all_concurrent(
+        self,
+        endpoints: List[str],
+        params: Optional[Dict[str, Any]] = None,
+        data_path: Optional[str] = None,
+        request_method: str = "GET",
+        output_file: Optional[str] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Fetch multiple endpoints concurrently and combine results.
+
+        Args:
+            endpoints (List[str]): List of API endpoint paths
+            params (Optional[Dict[str, Any]]): Query parameters for the requests
+            data_path (Optional[str]): JSON path to the data array within the response
+            request_method (str): HTTP method to use ("GET" or "POST")
+            output_file (Optional[str]): Override the instance output_file
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator yielding each result item
+        """
+        use_output_file = output_file or self.output_file
+
+        # Create tasks for each endpoint
+        tasks = [
+            self._create_endpoint_fetch_task(endpoint, params, data_path, request_method) for endpoint in endpoints
+        ]
+
+        # Execute tasks concurrently
+        with ThreadPoolExecutor(max_workers=self.concurrent_requests) as executor:
+            # Start all fetch tasks
+            futures = [executor.submit(task) for task in tasks]
+
+            # Process results as they complete
+            yield from self._process_concurrent_results(futures, use_output_file)
+
+        # If we're writing to a file, read it back as an iterator
+        if use_output_file:
+            yield from self.read_jsonl_file(use_output_file)
+
+    def _extract_next_token(self, response: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract the next token from a response.
+
+        This method tries several common patterns for next token references.
+
+        Args:
+            response (Dict[str, Any]): The API response
+
+        Returns:
+            Optional[str]: The next token or None if not found
+        """
+        # Try common patterns for next token
+        token_paths = [
+            ["nextToken"],
+            ["next_token"],
+            ["pagination", "nextToken"],
+            ["meta", "next_token"],
+            ["paging", "next"],
+            ["links", "next"],
+        ]
+
+        return self._extract_value_from_paths(response, token_paths)
+
+    def _extract_cursor(self, response: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract the cursor from a response.
+
+        This method tries several common patterns for cursor references.
+
+        Args:
+            response (Dict[str, Any]): The API response
+
+        Returns:
+            Optional[str]: The cursor or None if not found
+        """
+        # Try common patterns for cursor
+        cursor_paths = [
+            ["cursor"],
+            ["page", "cursor"],
+            ["meta", "cursor"],
+            ["paging", "cursors", "after"],
+            ["pagination", "cursor"],
+        ]
+
+        return self._extract_value_from_paths(response, cursor_paths)
+
+    def _extract_value_from_path(self, data: Dict[str, Any], path: List[str]) -> Optional[str]:
+        """
+        Extract a value from a nested dictionary using a single path.
+
+        Args:
+            data (Dict[str, Any]): The dictionary to search
+            path (List[str]): Path to the value
+
+        Returns:
+            Optional[str]: The found value or None
+        """
+        value = data
+        try:
+            for key in path:
+                if key in value:
+                    value = value[key]
+                else:
+                    return None
+            if value and isinstance(value, (str, int)):
+                return str(value)
+        except (KeyError, TypeError):
+            pass
+        return None
+
+    def _extract_value_from_paths(self, data: Dict[str, Any], paths: List[List[str]]) -> Optional[str]:
+        """
+        Extract a value from a nested dictionary using multiple possible paths.
+
+        Args:
+            data (Dict[str, Any]): The dictionary to search
+            paths (List[List[str]]): List of possible path lists to the value
+
+        Returns:
+            Optional[str]: The found value or None
+        """
+        for path in paths:
+            value = self._extract_value_from_path(data, path)
+            if value:
+                return value
+        return None
+
+    @staticmethod
+    def _parse_jsonl_line(line: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse a single JSONL line.
+
+        Args:
+            line (str): Line to parse
+
+        Returns:
+            Optional[Dict[str, Any]]: Parsed JSON or None on error
+        """
+        line = line.strip()
+        if not line:  # Skip empty lines
+            return None
+
+        try:
+            return json.loads(line)
+        except json.JSONDecodeError as e:
+            logger.error(f"Error parsing JSON line: {str(e)}")
+            logger.debug(f"Problematic line: {line}")
+            return None
+
+    @staticmethod
+    def read_jsonl_file(file_path: str) -> Iterator[Dict[str, Any]]:
+        """
+        Read a JSONL file and yield each line as a parsed JSON object.
+
+        Args:
+            file_path (str): Path to the JSONL file
+
+        Returns:
+            Iterator[Dict[str, Any]]: Iterator of parsed JSON objects
+        """
+        try:
+            with open(file_path, "r") as f:
+                for line in f:
+                    parsed = ApiPaginator._parse_jsonl_line(line)
+                    if parsed:
+                        yield parsed
+        except FileNotFoundError:
+            logger.error(f"File not found: {file_path}")
+        except IOError as e:
+            logger.error(f"IO error reading file {file_path}: {str(e)}")
+
+    def get_output_file_path(self) -> Optional[str]:
+        """
+        Get the path to the output file.
+
+        Returns:
+            Optional[str]: Path to the output file or None if not set
+        """
+        return self.output_file
+
+    def clear_output_file(self) -> None:
+        """
+        Clear the output file if it exists.
+        """
+        if not self.output_file:
+            return
+
+        if os.path.exists(self.output_file):
+            try:
+                os.remove(self.output_file)
+                logger.debug(f"Cleared output file: {self.output_file}")
+            except OSError as e:
+                logger.error(f"Error clearing output file {self.output_file}: {str(e)}")
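For orientation, a minimal usage sketch of the new ApiPaginator in streaming mode. The class, constructor parameters, and fetch_paginated_results signature are taken from the diff above; the service URL, bearer token, endpoint name, and data_path are hypothetical placeholders.

from regscale.integrations.api_paginator import ApiPaginator

# Hypothetical API and credentials; only the ApiPaginator interface is from the diff.
paginator = ApiPaginator(
    base_url="https://api.example.com/v2",
    auth_headers={"Authorization": "Bearer <token>"},
    page_size=200,  # sent as limit/offset or page/per_page depending on pagination_type
    max_pages=10,   # stop after 10 pages; None fetches everything
)

# With no output_file configured, items are yielded as each page arrives.
for finding in paginator.fetch_paginated_results(
    endpoint="findings",
    params={"severity": "high"},
    data_path="data.items",    # dot-separated path to the array in the response body
    pagination_type="offset",  # one of: offset, page, token, cursor, custom
):
    print(finding)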
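A second sketch covering "custom" pagination with a next_page_extractor while persisting to JSONL: pages are appended to the output file as they arrive, then read back and yielded once pagination completes (per fetch_paginated_results above). The "links.next" response shape, API key header, and file path are assumptions.

# Custom pagination writing to a JSONL file.
paginator = ApiPaginator(
    base_url="https://api.example.com/v2",
    auth_headers={"X-Api-Key": "<key>"},
    output_file="artifacts/assets.jsonl",
    throttle_rate=0.5,  # sleep half a second between page requests
)

def next_link(response):
    # Return the next page URL or token, or None to stop. An https:// value
    # replaces the request URL; anything else is passed along as a "next" parameter.
    return response.get("links", {}).get("next")

for asset in paginator.fetch_paginated_results(
    endpoint="assets",
    data_path="assets",
    pagination_type="custom",
    next_page_extractor=next_link,
):
    print(asset)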
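Finally, a sketch of fetch_all_concurrent, which fans several endpoints out across a thread pool and merges the results into one stream or file; the endpoint names and output path below are illustrative only.

# Fetch several endpoints in parallel and combine them into one JSONL file.
paginator = ApiPaginator(
    base_url="https://api.example.com/v2",
    auth_headers={"Authorization": "Bearer <token>"},
    concurrent_requests=4,  # thread pool size for fetch_all_concurrent
)

for item in paginator.fetch_all_concurrent(
    endpoints=["hosts", "vulnerabilities", "patches"],
    data_path="data",
    output_file="artifacts/combined.jsonl",
):
    print(item)

# The combined file can be re-read later without hitting the API again:
for item in ApiPaginator.read_jsonl_file("artifacts/combined.jsonl"):
    print(item)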