regscale-cli 6.16.1.0__py3-none-any.whl → 6.16.3.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

This release of regscale-cli has been marked as potentially problematic.
Files changed (46)
  1. regscale/__init__.py +1 -1
  2. regscale/core/app/internal/login.py +1 -1
  3. regscale/core/app/internal/poam_editor.py +1 -1
  4. regscale/core/app/utils/api_handler.py +4 -11
  5. regscale/integrations/commercial/__init__.py +2 -2
  6. regscale/integrations/commercial/ad.py +1 -1
  7. regscale/integrations/commercial/crowdstrike.py +0 -1
  8. regscale/integrations/commercial/grype/__init__.py +3 -0
  9. regscale/integrations/commercial/grype/commands.py +72 -0
  10. regscale/integrations/commercial/grype/scanner.py +390 -0
  11. regscale/integrations/commercial/import_all/import_all_cmd.py +2 -2
  12. regscale/integrations/commercial/opentext/__init__.py +6 -0
  13. regscale/integrations/commercial/opentext/commands.py +77 -0
  14. regscale/integrations/commercial/opentext/scanner.py +449 -85
  15. regscale/integrations/commercial/qualys.py +50 -61
  16. regscale/integrations/commercial/servicenow.py +1 -0
  17. regscale/integrations/commercial/snyk.py +2 -2
  18. regscale/integrations/commercial/synqly/ticketing.py +29 -0
  19. regscale/integrations/commercial/trivy/__init__.py +5 -0
  20. regscale/integrations/commercial/trivy/commands.py +74 -0
  21. regscale/integrations/commercial/trivy/scanner.py +276 -0
  22. regscale/integrations/commercial/veracode.py +1 -1
  23. regscale/integrations/commercial/wizv2/utils.py +1 -1
  24. regscale/integrations/jsonl_scanner_integration.py +869 -0
  25. regscale/integrations/public/fedramp/fedramp_common.py +4 -4
  26. regscale/integrations/public/fedramp/inventory_items.py +3 -3
  27. regscale/integrations/scanner_integration.py +225 -59
  28. regscale/models/integration_models/cisa_kev_data.json +65 -7
  29. regscale/models/integration_models/{flat_file_importer.py → flat_file_importer/__init__.py} +29 -8
  30. regscale/models/integration_models/snyk.py +141 -15
  31. regscale/models/integration_models/synqly_models/capabilities.json +1 -1
  32. regscale/models/integration_models/tenable_models/integration.py +42 -7
  33. regscale/models/integration_models/veracode.py +91 -48
  34. regscale/models/regscale_models/regscale_model.py +1 -1
  35. regscale/models/regscale_models/user.py +3 -4
  36. regscale/models/regscale_models/vulnerability.py +21 -0
  37. regscale/utils/version.py +3 -5
  38. {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.3.0.dist-info}/METADATA +3 -3
  39. {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.3.0.dist-info}/RECORD +43 -38
  40. regscale/integrations/commercial/grype.py +0 -165
  41. regscale/integrations/commercial/opentext/click.py +0 -99
  42. regscale/integrations/commercial/trivy.py +0 -162
  43. {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.3.0.dist-info}/LICENSE +0 -0
  44. {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.3.0.dist-info}/WHEEL +0 -0
  45. {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.3.0.dist-info}/entry_points.txt +0 -0
  46. {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.3.0.dist-info}/top_level.txt +0 -0
regscale/integrations/jsonl_scanner_integration.py (new file)
@@ -0,0 +1,869 @@
"""
Abstract base class for scanner integrations that use JSONL files for intermediate storage.
"""

import dataclasses
import json
import logging
import os
import shutil
import tempfile
from typing import Any, Dict, Iterator, Optional, Union, Tuple, TypeVar, Type, List

import boto3
from pathlib import Path

from regscale.core.app.utils.file_utils import is_s3_path, read_file, find_files, download_from_s3
from regscale.exceptions import ValidationException
from regscale.integrations.scanner_integration import IntegrationAsset, IntegrationFinding, ScannerIntegration
from regscale.models.app_models.mapping import Mapping

logger = logging.getLogger("regscale")

# Define generic types for items that can be written to file
T = TypeVar("T")
ItemType = TypeVar("ItemType", IntegrationAsset, IntegrationFinding)


class JSONLScannerIntegration(ScannerIntegration):
    """
    Abstract base class for scanner integrations that use JSONL files for intermediate storage.

    This class extends ScannerIntegration to provide common functionality for scanners
    that process source files (local or S3) and store the results in JSONL files before syncing to RegScale.
    Supports reading files directly without downloading when read_files_only is True.

    Subclasses must implement:
    - find_valid_files: To find valid source files
    - parse_asset: To parse an asset from a source file
    - parse_finding: To parse a finding from a source file
    - is_valid_file: To validate a file before processing
    """

    # Constants for file paths - subclasses should override these
    ASSETS_FILE = "./artifacts/assets.jsonl"
    FINDINGS_FILE = "./artifacts/findings.jsonl"

    def __init__(self, *args, **kwargs):
        """
        Initialize the JSONLScannerIntegration.
        """
        # Extract S3-related kwargs
        self.s3_bucket = kwargs.pop("s3_bucket", None)
        self.s3_prefix = kwargs.pop("s3_prefix", "")
        self.aws_profile = kwargs.pop("aws_profile", "default")

        self.plan_id = kwargs.pop("plan_id", None)
        self.file_path = kwargs.pop("file_path", None)
        self.empty_files: bool = True
        self.scan_date = kwargs.pop("scan_date", None)
        self.download_destination = kwargs.pop("destination", None)
        self.file_pattern = kwargs.pop("file_pattern", "*.json")
        self.read_files_only = kwargs.pop("read_files_only", False)

        # Extract mapping-related kwargs
        self.disable_mapping = kwargs.pop("disable_mapping", False)
        self.mapping_path = kwargs.pop("mapping_path", f"./mappings/{self.__class__.__name__.lower()}/mapping.json")
        self.required_asset_fields = kwargs.pop("required_asset_fields", ["identifier", "name"])
        self.required_finding_fields = kwargs.pop("required_finding_fields", ["asset_identifier", "title", "severity"])
        self.mapping = self._load_mapping() if not self.disable_mapping else None

        # Initialize parent class
        super().__init__(plan_id=self.plan_id, **kwargs)

        self.s3_client = None
        if self.s3_bucket and not self.read_files_only:
            try:
                session = boto3.Session(profile_name=self.aws_profile)
                self.s3_client = session.client("s3")
            except Exception as e:
                logger.error(f"Failed to initialize S3 client with profile {self.aws_profile}: {str(e)}")
                raise ValidationException(f"S3 client initialization failed: {str(e)}")

    def _load_mapping(self) -> Optional[Mapping]:
        """Load the mapping configuration from a JSON file."""
        try:
            mapping_file = Path(self.mapping_path)
            if mapping_file.exists():
                with mapping_file.open("r") as f:
                    mapping_data = json.load(f)
                return Mapping(**mapping_data)
            logger.debug(f"No mapping file found at {self.mapping_path}, using default mapping")
            return None
        except Exception as e:
            logger.error(f"Error loading mapping file {self.mapping_path}: {str(e)}")
            return None

    def _apply_mapping(
        self, source_data: Dict[str, Any], target_fields: Dict[str, Any], mapping: Dict[str, str]
    ) -> Dict[str, Any]:
        """Apply field mapping from source data to target fields."""
        mapped_data = target_fields.copy()

        if self.disable_mapping or not self.mapping or not hasattr(self.mapping, "fields"):
            return {**source_data, **mapped_data}

        for target_field, source_field in mapping.items():
            if source_field in source_data:
                mapped_data[target_field] = source_data[source_field]
            elif isinstance(source_field, dict) and "default" in source_field:
                mapped_data[target_field] = source_field["default"]

        return mapped_data

    def _validate_fields(self, item: Union[IntegrationAsset, IntegrationFinding], required_fields: list) -> None:
        """Validate that all required fields are present and non-empty."""
        missing_fields = []
        item_dict = dataclasses.asdict(item)

        for field in required_fields:
            if field not in item_dict or not item_dict[field]:
                missing_fields.append(field)

        if missing_fields:
            item_type = "asset" if isinstance(item, IntegrationAsset) else "finding"
            raise ValueError(f"Missing or empty required fields for {item_type}: {', '.join(missing_fields)}")

    def create_artifacts_dir(self) -> Path:
        """Create artifacts directory if it doesn't exist."""
        artifacts_dir = Path("./artifacts")
        artifacts_dir.mkdir(exist_ok=True, parents=True)
        return artifacts_dir

    def _get_item_key(self, item_dict: Dict[str, Any], item_type: str) -> str:
        """Generate a unique key for an item (asset or finding) dictionary."""
        if item_type == "asset":
            return item_dict.get("identifier", "unknown")
        else:  # finding
            asset_id = item_dict.get("asset_identifier", "unknown")
            cve = item_dict.get("cve", "")
            title = item_dict.get("title", "")
            if cve:
                return f"{asset_id}:{cve}"
            return f"{asset_id}:{title}"

    def _prepare_output_file(self, output_file: str, empty_file: bool, item_type: str) -> Dict[str, bool]:
        """Prepare output file and load existing records if necessary."""
        existing_items: Dict[str, bool] = {}

        if empty_file and os.path.exists(output_file):
            logger.info(f"Emptying existing file: {output_file}")
            open(output_file, "w").close()
        elif os.path.exists(output_file) and os.path.getsize(output_file) > 0:
            logger.info(f"Reading existing records from: {output_file}")
            try:
                with open(output_file, "r") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            key = self._get_item_key(record, item_type)
                            existing_items[key] = True
                        except json.JSONDecodeError:
                            logger.warning(f"Could not parse line in {output_file}")
            except Exception as e:
                logger.warning(f"Error reading existing records: {str(e)}")

        return existing_items

    def _write_items_to_jsonl(
        self,
        file_path: str,
        output_file: str,
        item_type: str,
        empty_file: bool = True,
    ) -> int:
        """
        Process source files (local or S3) and write items (assets or findings) to JSONL.

        :param str file_path: Path to source file or directory (local or S3 URI)
        :param str output_file: Path to output JSONL file
        :param str item_type: Type of items to process ('asset' or 'finding')
        :param bool empty_file: Whether to empty the output file before writing (default: True)
        :return: Total count of items written
        :rtype: int
        """
        existing_items = self._prepare_output_file(output_file, empty_file, item_type)
        total_items_count = len(existing_items)
        processed_files = set()
        new_items_count = 0

        with open(output_file, "a") as output_f:
            for file_data in self.find_valid_files(file_path):
                if isinstance(file_data, tuple) and len(file_data) >= 2:
                    file, data = file_data[0], file_data[1]
                else:
                    file, data = file_data, None

                file_str = str(file)
                if file_str in processed_files:
                    continue

                processed_files.add(file_str)

                try:
                    logger.info(f"Processing file: {file}")
                    if item_type == "asset":
                        self._process_asset_file(file, data, output_f, existing_items)
                        new_items_count += 1
                        total_items_count += 1
                    else:
                        findings_count = self._process_finding_file(file, data, output_f, existing_items)
                        new_items_count += findings_count
                        total_items_count += findings_count

                except Exception as e:
                    logger.error(f"Error processing file {file}: {str(e)}")

        item_type_label = "assets" if item_type == "asset" else "findings"
        logger.info(f"Added {new_items_count} new {item_type_label} to {output_file}")
        return total_items_count

    def _process_asset_file(self, file, data, output_f, existing_items):
        """
        Process a single file for assets with mapping and validation.

        :param file: The file being processed
        :param data: The data from the file
        :param output_f: The output file handle
        :param existing_items: Dictionary of existing items
        :return: Number of assets processed
        :rtype: int
        """
        asset = self.parse_asset(file, data)
        asset_dict = dataclasses.asdict(asset)

        if not self.disable_mapping:
            mapped_asset_dict = self._apply_mapping(
                data or {},
                asset_dict,
                getattr(self.mapping, "fields", {}).get("asset_mapping", {}) if self.mapping else {},
            )
            mapped_asset = IntegrationAsset(**mapped_asset_dict)
        else:
            mapped_asset = asset

        self._validate_fields(mapped_asset, self.required_asset_fields)

        key = self._get_item_key(dataclasses.asdict(mapped_asset), "asset")
        if key in existing_items:
            logger.debug(f"Asset with identifier {key} already exists, skipping")
            return 0

        output_f.write(json.dumps(dataclasses.asdict(mapped_asset)) + "\n")
        output_f.flush()
        existing_items[key] = True
        return 1

    def _process_finding_file(self, file, data, output_f, existing_items):
        """
        Process a single file for findings with mapping and validation.

        :param file: The file being processed
        :param data: The data from the file
        :param output_f: The output file handle
        :param existing_items: Dictionary of existing items
        :return: Number of findings processed
        :rtype: int
        """
        asset = self.parse_asset(file, data)
        identifier = asset.identifier
        findings_data = self._get_findings_data_from_file(data)

        findings_in_file = 0
        for finding_item in findings_data:
            finding = self.parse_finding(identifier, data, finding_item)
            finding_dict = dataclasses.asdict(finding)

            if not self.disable_mapping:
                mapped_finding_dict = self._apply_mapping(
                    finding_item,
                    finding_dict,
                    getattr(self.mapping, "fields", {}).get("finding_mapping", {}) if self.mapping else {},
                )
                mapped_finding = IntegrationFinding(**mapped_finding_dict)
            else:
                mapped_finding = finding

            self._validate_fields(mapped_finding, self.required_finding_fields)

            key = self._get_item_key(dataclasses.asdict(mapped_finding), "finding")
            if key in existing_items:
                logger.debug(f"Finding with key {key} already exists, skipping")
                continue

            output_f.write(json.dumps(dataclasses.asdict(mapped_finding)) + "\n")
            output_f.flush()
            existing_items[key] = True
            findings_in_file += 1

        if findings_in_file > 0:
            logger.info(f"Added {findings_in_file} new findings from file {file}")
        return findings_in_file

    def _get_findings_data_from_file(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract findings data from file data (default implementation).

        Subclasses must override this method to extract findings data from their specific file format.

        :param Dict[str, Any] data: The data from the file
        :return: Iterable of finding items
        """
        # Default implementation returns an empty list
        # Subclasses must override this method
        return []

    def _yield_items_from_jsonl(self, jsonl_file: str, item_class: Type[ItemType]) -> Iterator[ItemType]:
        """
        Read items from JSONL file and yield them one by one.

        :param str jsonl_file: Path to JSONL file containing items
        :param Type[ItemType] item_class: Class to convert dictionary items to (IntegrationAsset or IntegrationFinding)
        :yields: Items one at a time
        :rtype: Iterator[ItemType]
        """
        if not os.path.exists(jsonl_file):
            logger.warning(f"JSONL file {jsonl_file} does not exist")
            return

        logger.info(f"Reading items from {jsonl_file}")
        with open(jsonl_file, "r") as f:
            for line_number, line in enumerate(f, 1):
                try:
                    item_dict = json.loads(line.strip())
                    yield item_class(**item_dict)
                except json.JSONDecodeError:
                    logger.warning(f"Could not parse line {line_number} in {jsonl_file}")
                except Exception as e:
                    logger.error(f"Error processing line {line_number} in {jsonl_file}: {str(e)}")

    def _process_files(
        self,
        file_path: Union[str, Path],
        assets_output_file: str,
        findings_output_file: str,
        empty_assets_file: bool = True,
        empty_findings_file: bool = True,
    ) -> Tuple[int, int]:
        """
        Process files (local or S3) to extract both assets and findings in a single pass.

        Optimizes file processing by reading each file once to extract asset and finding data.

        :param Union[str, Path] file_path: Path to source file or directory (local or S3 URI)
        :param str assets_output_file: Path to output JSONL file for assets
        :param str findings_output_file: Path to output JSONL file for findings
        :param bool empty_assets_file: Whether to empty the assets file before writing (default: True)
        :param bool empty_findings_file: Whether to empty the findings file before writing (default: True)
        :return: Tuple of total asset and finding counts
        :rtype: Tuple[int, int]
        """
        existing_assets = self._prepare_output_file(assets_output_file, empty_assets_file, "asset")
        existing_findings = self._prepare_output_file(findings_output_file, empty_findings_file, "finding")

        asset_tracker = self._init_tracker(existing_assets)
        finding_tracker = self._init_tracker(existing_findings)
        processed_files = set()

        with open(assets_output_file, "a") as assets_file, open(findings_output_file, "a") as findings_file:
            for file, data in self._get_valid_file_data(file_path):
                if str(file) in processed_files:
                    continue

                processed_files.add(str(file))
                self._process_file(file, data, assets_file, findings_file, asset_tracker, finding_tracker)

        self._log_processing_results(asset_tracker.new_count, assets_output_file, "assets")
        self._log_processing_results(finding_tracker.new_count, findings_output_file, "findings")
        return asset_tracker.total_count, finding_tracker.total_count

    def _init_tracker(self, existing_items: Dict[str, bool]) -> "CountTracker":
        """
        Initialize a tracker for counting new and total items.

        :param Dict[str, bool] existing_items: Dictionary of existing item keys
        :return: Tracker object for managing counts
        :rtype: CountTracker
        """
        from dataclasses import dataclass

        @dataclass
        class CountTracker:
            existing: Dict[str, bool]
            new_count: int = 0
            total_count: int = 0

        return CountTracker(existing=existing_items, total_count=len(existing_items))

    def _get_valid_file_data(
        self, file_path: Union[str, Path]
    ) -> Iterator[Tuple[Union[Path, str], Optional[Dict[str, Any]]]]:
        """
        Yield valid file data from the given path.

        :param Union[str, Path] file_path: Path to source file or directory (local or S3 URI)
        :return: Iterator yielding tuples of (file path, parsed data)
        :rtype: Iterator[Tuple[Union[Path, str], Optional[Dict[str, Any]]]]
        """
        for file_data in self.find_valid_files(file_path):
            if isinstance(file_data, tuple) and len(file_data) >= 2:
                yield file_data[0], file_data[1]
            else:
                yield file_data, None

    def _process_file(
        self,
        file: Union[Path, str],
        data: Optional[Dict[str, Any]],
        assets_file: Any,
        findings_file: Any,
        asset_tracker: "CountTracker",
        finding_tracker: "CountTracker",
    ) -> None:
        """
        Process a single file for assets and findings.

        :param Union[Path, str] file: Path to the file being processed
        :param Optional[Dict[str, Any]] data: Parsed data from the file
        :param Any assets_file: Open file handle for writing assets
        :param Any findings_file: Open file handle for writing findings
        :param CountTracker asset_tracker: Tracker for asset counts
        :param CountTracker finding_tracker: Tracker for finding counts
        :rtype: None
        """
        try:
            logger.info(f"Processing file: {file}")
            self._process_asset(file, data, assets_file, asset_tracker)
            self._process_findings(file, data, findings_file, asset_tracker.existing, finding_tracker)
        except Exception as e:
            logger.error(f"Error processing file {file}: {str(e)}")

    def _process_asset(
        self,
        file: Union[Path, str],
        data: Optional[Dict[str, Any]],
        assets_file: Any,
        tracker: "CountTracker",
    ) -> None:
        """
        Process and write a single asset from file data.

        :param Union[Path, str] file: Path to the file being processed
        :param Optional[Dict[str, Any]] data: Parsed data from the file
        :param Any assets_file: Open file handle for writing assets
        :param CountTracker tracker: Tracker for asset counts
        :rtype: None
        """
        asset = self.parse_asset(file, data)
        asset_dict = dataclasses.asdict(asset)
        mapped_asset = self._map_item(asset_dict, "asset_mapping", IntegrationAsset)
        self._validate_fields(mapped_asset, self.required_asset_fields)

        asset_key = mapped_asset.identifier
        if asset_key not in tracker.existing:
            self._write_item(assets_file, mapped_asset)
            tracker.existing[asset_key] = True
            tracker.new_count += 1
            tracker.total_count += 1
        else:
            logger.debug(f"Asset with identifier {asset_key} already exists, skipping")

    def _process_findings(
        self,
        file: Union[Path, str],
        data: Optional[Dict[str, Any]],
        findings_file: Any,
        existing_assets: Dict[str, bool],
        tracker: "CountTracker",
    ) -> None:
        """
        Process and write findings from file data.

        :param Union[Path, str] file: Path to the file being processed
        :param Optional[Dict[str, Any]] data: Parsed data from the file
        :param Any findings_file: Open file handle for writing findings
        :param Dict[str, bool] existing_assets: Dictionary of existing asset keys
        :param CountTracker tracker: Tracker for finding counts
        :rtype: None
        """
        findings_data = self._get_findings_data_from_file(data)
        logger.info(f"Found {len(findings_data)} findings in file: {file}")
        findings_added = 0

        asset_id = list(existing_assets.keys())[0] if existing_assets else "unknown"
        for finding_item in findings_data:
            finding = self.parse_finding(asset_id, data, finding_item)
            finding_dict = dataclasses.asdict(finding)
            mapped_finding = self._map_item(finding_dict, "finding_mapping", IntegrationFinding)
            self._validate_fields(mapped_finding, self.required_finding_fields)

            finding_key = self._get_item_key(dataclasses.asdict(mapped_finding), "finding")
            if finding_key not in tracker.existing:
                self._write_item(findings_file, mapped_finding)
                tracker.existing[finding_key] = True
                tracker.new_count += 1
                tracker.total_count += 1
                findings_added += 1
            else:
                logger.debug(f"Finding with key {finding_key} already exists, skipping")

        if findings_added > 0:
            logger.info(f"Added {findings_added} new findings from file {file}")

    def _map_item(self, item_dict: Dict[str, Any], mapping_key: str, item_class: Type) -> Any:
        """
        Apply mapping to an item dictionary if enabled.

        :param Dict[str, Any] item_dict: Dictionary of item data
        :param str mapping_key: Key in the mapping configuration to use (e.g., 'asset_mapping')
        :param Type item_class: Class to instantiate with mapped data (IntegrationAsset or IntegrationFinding)
        :return: Instantiated item object with mapped data
        :rtype: Any
        """
        if not self.disable_mapping and self.mapping and hasattr(self.mapping, "fields"):
            mapped_dict = self._apply_mapping(
                item_dict, item_dict, getattr(self.mapping, "fields", {}).get(mapping_key, {})
            )
            return item_class(**mapped_dict)
        return item_class(**item_dict)

    def _write_item(self, file_handle: Any, item: Any) -> None:
        """
        Write an item to the specified file handle.

        :param Any file_handle: Open file handle to write to
        :param Any item: Item object to write (IntegrationAsset or IntegrationFinding)
        :rtype: None
        """
        file_handle.write(json.dumps(dataclasses.asdict(item)) + "\n")
        file_handle.flush()

    def _log_processing_results(self, new_count: int, output_file: str, item_type: str) -> None:
        """
        Log the results of processing items.

        :param int new_count: Number of new items added
        :param str output_file: Path to the output file
        :param str item_type: Type of items processed ('assets' or 'findings')
        :rtype: None
        """
        logger.info(f"Added {new_count} new {item_type} to {output_file}")

    def _validate_file_path(self, file_path: Optional[str]) -> str:
        """
        Validates the file path and raises an exception if it's invalid.

        :param Optional[str] file_path: Path to validate
        :return: The validated file path
        :rtype: str
        :raises ValidationException: If the file path is invalid
        """
        if not file_path:
            logger.error("No file path provided")
            raise ValidationException("File path is required")

        if not is_s3_path(file_path) and not os.path.exists(file_path):
            logger.error(f"File path does not exist: {file_path}")
            raise ValidationException(f"Path does not exist: {file_path}")

        return file_path

    def fetch_assets(self, *args: Any, **kwargs: Any) -> Iterator[IntegrationAsset]:
        """
        Fetches assets from processed source files (local or S3).

        This method supports both local files/directories and S3 paths.

        :param str file_path: Path to a source file or directory
        :param bool empty_file: Whether to empty the output file before writing (default: True)
        :param bool process_together: Whether to process assets and findings together (default: False)
        :param bool use_jsonl_file: Whether to use an existing JSONL file instead of processing source files
            (default: False)
        :yields: Iterator[IntegrationAsset]
        """
        logger.info("Starting fetch_assets")
        file_path = kwargs.get("file_path", self.file_path)
        empty_file = kwargs.get("empty_file", True)
        process_together = kwargs.get("process_together", False)
        use_jsonl_file = kwargs.get("use_jsonl_file", False)

        self.create_artifacts_dir()

        if use_jsonl_file:
            logger.info(f"Using existing JSONL file: {self.ASSETS_FILE}")
            total_assets = sum(1 for _ in open(self.ASSETS_FILE, "r")) if os.path.exists(self.ASSETS_FILE) else 0
            self.num_assets_to_process = total_assets
            logger.info(f"Found {total_assets} assets in existing JSONL file")
        else:
            file_path = self._validate_file_path(file_path)
            if process_together:
                total_assets, _ = self._process_files(
                    file_path,
                    self.ASSETS_FILE,
                    self.FINDINGS_FILE,
                    empty_assets_file=empty_file,
                    empty_findings_file=False,
                )
                self.num_assets_to_process = total_assets
            else:
                total_assets = self._write_items_to_jsonl(file_path, self.ASSETS_FILE, "asset", empty_file=empty_file)
                self.num_assets_to_process = total_assets
            logger.info(f"Total assets to process: {total_assets}")

        for asset in self._yield_items_from_jsonl(self.ASSETS_FILE, IntegrationAsset):
            yield asset

        logger.info(f"Assets read from JSONL complete. Total assets identified: {self.num_assets_to_process}")

    def fetch_findings(self, *args: Any, **kwargs: Any) -> Iterator[IntegrationFinding]:
        """
        Fetches findings from processed source files (local or S3).

        This method supports both local files/directories and S3 paths.

        :param str file_path: Path to source file or directory
        :param bool empty_file: Whether to empty the output file before writing (default: True)
        :param bool process_together: Whether to process assets and findings together (default: False)
        :param bool use_jsonl_file: Whether to use an existing JSONL file instead of processing source files (default: False)
        :yields: Iterator[IntegrationFinding]
        """
        logger.info("Starting fetch_findings")
        file_path = kwargs.get("file_path", self.file_path)
        empty_file = kwargs.get("empty_file", True)
        process_together = kwargs.get("process_together", False)
        use_jsonl_file = kwargs.get("use_jsonl_file", False)

        self.create_artifacts_dir()

        if use_jsonl_file:
            logger.info(f"Using existing JSONL file: {self.FINDINGS_FILE}")
            total_findings = sum(1 for _ in open(self.FINDINGS_FILE, "r")) if os.path.exists(self.FINDINGS_FILE) else 0
            self.num_findings_to_process = total_findings
            logger.info(f"Found {total_findings} findings in existing JSONL file")
        else:
            file_path = self._validate_file_path(file_path)
            if process_together:
                _, total_findings = self._process_files(
                    file_path,
                    self.ASSETS_FILE,
                    self.FINDINGS_FILE,
                    empty_assets_file=False,
                    empty_findings_file=empty_file,
                )
                self.num_findings_to_process = total_findings
            else:
                total_findings = self._write_items_to_jsonl(
                    file_path, self.FINDINGS_FILE, "finding", empty_file=empty_file
                )
                self.num_findings_to_process = total_findings
            logger.info(f"Total findings to process: {total_findings}")

        for finding in self._yield_items_from_jsonl(self.FINDINGS_FILE, IntegrationFinding):
            yield finding

        logger.info(f"Findings read from JSONL complete. Total findings identified: {self.num_findings_to_process}")

    def fetch_assets_and_findings(
        self, file_path: str = None, empty_files: bool = True
    ) -> Tuple[Iterator[IntegrationAsset], Iterator[IntegrationFinding]]:
        """Process both assets and findings (local or S3) in a single pass and return iterators.

        This method optimizes the processing by reading each file only once and extracting
        both asset and finding information in a single pass. It returns two iterators,
        one for assets and one for findings.

        :param str file_path: Path to source file or directory
        :param bool empty_files: Whether to empty both output files before writing (default: True)
        :return: Tuple of (assets_iterator, findings_iterator)
        :rtype: Tuple[Iterator[IntegrationAsset], Iterator[IntegrationFinding]]
        """
        file_path = self._validate_file_path(file_path or self.file_path)
        self.create_artifacts_dir()

        logger.info("Processing assets and findings together from %s", file_path)
        total_assets, total_findings = self._process_files(
            file_path=file_path,
            assets_output_file=self.ASSETS_FILE,
            findings_output_file=self.FINDINGS_FILE,
            empty_assets_file=empty_files,
            empty_findings_file=empty_files,
        )

        self.num_assets_to_process = total_assets
        self.num_findings_to_process = total_findings

        assets_iterator = self._yield_items_from_jsonl(self.ASSETS_FILE, IntegrationAsset)
        findings_iterator = self._yield_items_from_jsonl(self.FINDINGS_FILE, IntegrationFinding)
        return assets_iterator, findings_iterator

    def sync_assets_and_findings(self) -> None:
        """Process both assets and findings (local or S3) in a single pass and sync to RegScale.

        This method optimizes the processing by reading each file only once and
        extracting both asset and finding information in a single pass.

        :param int plan_id: RegScale Security Plan ID
        :param str file_path: Path to source file or directory
        :param bool empty_files: Whether to empty both output files before writing (default: True)
        :rtype: None
        """
        file_path = self._validate_file_path(self.file_path)
        logger.info("Processing assets and findings together from %s", file_path)
        total_assets, total_findings = self._process_files(
            file_path=file_path,
            assets_output_file=self.ASSETS_FILE,
            findings_output_file=self.FINDINGS_FILE,
            empty_assets_file=self.empty_files,
            empty_findings_file=self.empty_files,
        )

        logger.info("Syncing %d assets to RegScale", total_assets)
        self.sync_assets(
            plan_id=self.plan_id,
            file_path=file_path,
            use_jsonl_file=True,
            asset_count=total_assets,
        )

        logger.info("Syncing %d findings to RegScale", total_findings)
        self.sync_findings(
            plan_id=self.plan_id,
            file_path=file_path,
            use_jsonl_file=True,
            finding_count=total_findings,
        )

        logger.info("Assets and findings sync complete")

    # Abstract method with default implementation for reading files
    def find_valid_files(self, path: Union[Path, str]) -> Iterator[Tuple[Union[Path, str], Dict[str, Any]]]:
        """
        Find all valid source files in the given path and read their contents if read_files_only is True.

        Subclasses must override this method to customize file validation and data extraction.

        :param Union[Path, str] path: Path to a file or directory (local or S3 URI)
        :return: Iterator yielding tuples of (file path, validated data)
        :rtype: Iterator[Tuple[Union[Path, str], Dict[str, Any]]]
        """
        for file in find_files(path, self.file_pattern):
            data = self._read_file_content(file)
            if data is not None:
                yield from self._validate_and_yield(file, data)

    def _read_file_content(self, file: Union[Path, str]) -> Optional[Dict[str, Any]]:
        """
        Read and parse the content of a file based on read_files_only setting.

        :param Union[Path, str] file: Path to the file to read
        :return: Parsed JSON data or None if reading fails
        :rtype: Optional[Dict[str, Any]]
        """
        try:
            if self.read_files_only:
                return self._read_content_directly(file)
            return self._read_content_with_download(file)
        except json.JSONDecodeError:
            logger.warning(f"File {file} is not valid JSON, skipping")
            return None
        except Exception as e:
            logger.error(f"Error reading file {file}: {str(e)}")
            return None

    def _read_content_directly(self, file: Union[Path, str]) -> Dict[str, Any]:
        """
        Read file content directly when read_files_only is True.

        :param Union[Path, str] file: Path to the file
        :return: Parsed JSON data
        :rtype: Dict[str, Any]
        """
        content = read_file(file)
        return json.loads(content) if content else {}

    def _read_content_with_download(self, file: Union[Path, str]) -> Dict[str, Any]:
        """
        Read file content, downloading from S3 if necessary, when read_files_only is False.

        :param Union[Path, str] file: Path to the file (local or S3 URI)
        :return: Parsed JSON data
        :rtype: Dict[str, Any]
        """
        if is_s3_path(file):
            temp_dir = Path(tempfile.mkdtemp())
            try:
                s3_parts = file[5:].split("/", 1)
                bucket = s3_parts[0]
                prefix = s3_parts[1] if len(s3_parts) > 1 else ""
                download_from_s3(bucket, prefix, temp_dir, self.aws_profile)
                local_file = temp_dir / os.path.basename(prefix)
                with open(local_file, "r") as f:
                    return json.load(f)
            finally:
                shutil.rmtree(temp_dir)
        else:
            with open(file, "r") as f:
                return json.load(f)

    def _validate_and_yield(
        self, file: Union[Path, str], data: Dict[str, Any]
    ) -> Iterator[Tuple[Union[Path, str], Dict[str, Any]]]:
        """
        Validate file data and yield it if valid.

        :param Union[Path, str] file: Path to the file
        :param Dict[str, Any] data: Parsed data from the file
        :return: Iterator yielding valid file data tuples
        :rtype: Iterator[Tuple[Union[Path, str], Dict[str, Any]]]
        """
        is_valid, validated_data = self.is_valid_file(data, file)
        if is_valid and validated_data is not None:
            yield file, validated_data

    def parse_asset(self, file_path: Union[Path, str], data: Dict[str, Any]) -> IntegrationAsset:
        """
        Parse a single asset from source data.

        Subclasses must implement this method to parse assets from their specific file format.

        :param Union[Path, str] file_path: Path to the file containing the asset data
        :param Dict[str, Any] data: The parsed data
        :return: IntegrationAsset object
        :rtype: IntegrationAsset
        """
        raise NotImplementedError("Subclasses must implement parse_asset")

    def parse_finding(self, asset_identifier: str, data: Dict[str, Any], item: Dict[str, Any]) -> IntegrationFinding:
        """Parse a single finding from source data.

        Subclasses must implement this method to parse findings from their specific file format.

        :param str asset_identifier: The identifier of the asset this finding belongs to
        :param Dict[str, Any] data: The asset data
        :param Dict[str, Any] item: The finding data
        :return: IntegrationFinding object
        :rtype: IntegrationFinding
        """
        raise NotImplementedError("Subclasses must implement parse_finding")

    def is_valid_file(self, data: Any, file_path: Union[Path, str]) -> Tuple[bool, Optional[Dict[str, Any]]]:
        """
        Check if the provided data is valid for processing.

        This default implementation ensures the data is a non-empty dictionary.
        Subclasses should override this to implement specific validation logic.

        :param Any data: Data parsed from the file to validate
        :param Union[Path, str] file_path: Path to the file being processed
        :return: Tuple of (is_valid, data) where is_valid indicates validity and data is the validated content or None
        :rtype: Tuple[bool, Optional[Dict[str, Any]]]
        """
        if not isinstance(data, dict):
            logger.warning(f"Data is not a dictionary for file {file_path}, skipping")
            return False, None

        if not data:
            logger.warning(f"Data is an empty dictionary for file {file_path}, skipping")
            return False, None

        return True, data
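
For readers evaluating the new JSONLScannerIntegration base class, a minimal subclass might look like the sketch below. It is illustrative only and not part of this release: the scanner name, the report field names (host, vulns, id, severity), and the assumption that IntegrationAsset and IntegrationFinding need no further constructor arguments are all hypothetical. The target field names (identifier, name, asset_identifier, title, severity, cve) come from the defaults and key logic in the base class shown above.

    # Hypothetical subclass sketch -- not part of regscale-cli 6.16.3.0.
    from pathlib import Path
    from typing import Any, Dict, List, Union

    from regscale.integrations.jsonl_scanner_integration import JSONLScannerIntegration
    from regscale.integrations.scanner_integration import IntegrationAsset, IntegrationFinding


    class ExampleScannerIntegration(JSONLScannerIntegration):
        """Sketch of a scanner that reads one JSON report per host."""

        def _get_findings_data_from_file(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
            # Assumes the fictional report keeps its findings under a "vulns" key.
            return data.get("vulns", [])

        def parse_asset(self, file_path: Union[Path, str], data: Dict[str, Any]) -> IntegrationAsset:
            # "host" is an assumed report field; identifier and name satisfy the
            # default required_asset_fields from the base class.
            host = data.get("host", str(file_path))
            return IntegrationAsset(identifier=host, name=host)

        def parse_finding(
            self, asset_identifier: str, data: Dict[str, Any], item: Dict[str, Any]
        ) -> IntegrationFinding:
            # "id" and "severity" are assumed report fields; asset_identifier, title
            # and severity satisfy the default required_finding_fields, and cve feeds
            # the deduplication key built by _get_item_key().
            return IntegrationFinding(
                asset_identifier=asset_identifier,
                title=item.get("id", "Unknown finding"),
                severity=item.get("severity", "Low"),
                cve=item.get("id", ""),
            )

The constructor keywords below are the ones popped in JSONLScannerIntegration.__init__ above; the plan ID and path are placeholders:

    scanner = ExampleScannerIntegration(plan_id=123, file_path="./reports", file_pattern="*.json")
    scanner.sync_assets_and_findings()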
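The optional field-mapping file that _load_mapping() reads from ./mappings/<classname>/mapping.json is not included in this diff. Based on how _apply_mapping() and _map_item() consume it above, a plausible shape is sketched below; the real schema is whatever regscale.models.app_models.mapping.Mapping accepts, so the keys and source field names here are an inference, not documentation.

    # Inferred mapping-file shape, written from Python so the assumptions can be commented.
    import json
    from pathlib import Path

    example_mapping = {
        "fields": {
            # target IntegrationAsset field -> source report field, or {"default": ...} literal
            "asset_mapping": {
                "identifier": "hostname",            # assumed source field
                "name": "hostname",                  # assumed source field
                "asset_type": {"default": "Other"},  # fallback value via the "default" branch
            },
            # target IntegrationFinding field -> source report field
            "finding_mapping": {
                "title": "vuln_id",   # assumed source field
                "severity": "risk",   # assumed source field
            },
        }
    }

    # Default location derived from the class name, per the mapping_path default above.
    path = Path("./mappings/examplescannerintegration/mapping.json")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(example_mapping, indent=2))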
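Because fetch_assets and fetch_findings both honor the use_jsonl_file flag, a repeated run can reuse the ./artifacts/assets.jsonl and ./artifacts/findings.jsonl files instead of re-parsing every report. A brief sketch, reusing the hypothetical ExampleScannerIntegration from above with a placeholder plan ID:

    scanner = ExampleScannerIntegration(plan_id=123)
    # Iterate straight from the existing JSONL artifacts without touching the source reports.
    for asset in scanner.fetch_assets(use_jsonl_file=True):
        print(asset.identifier)
    for finding in scanner.fetch_findings(use_jsonl_file=True):
        print(finding.asset_identifier, finding.title)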