regscale-cli 6.16.1.0__py3-none-any.whl → 6.16.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of regscale-cli might be problematic.
- regscale/__init__.py +1 -1
- regscale/core/app/internal/login.py +1 -1
- regscale/core/app/internal/poam_editor.py +1 -1
- regscale/integrations/commercial/__init__.py +2 -2
- regscale/integrations/commercial/ad.py +1 -1
- regscale/integrations/commercial/grype/__init__.py +3 -0
- regscale/integrations/commercial/grype/commands.py +72 -0
- regscale/integrations/commercial/grype/scanner.py +390 -0
- regscale/integrations/commercial/import_all/import_all_cmd.py +2 -2
- regscale/integrations/commercial/opentext/__init__.py +6 -0
- regscale/integrations/commercial/opentext/commands.py +77 -0
- regscale/integrations/commercial/opentext/scanner.py +449 -85
- regscale/integrations/commercial/trivy/__init__.py +5 -0
- regscale/integrations/commercial/trivy/commands.py +74 -0
- regscale/integrations/commercial/trivy/scanner.py +276 -0
- regscale/integrations/commercial/wizv2/utils.py +1 -1
- regscale/integrations/jsonl_scanner_integration.py +869 -0
- regscale/integrations/public/fedramp/fedramp_common.py +4 -4
- regscale/integrations/public/fedramp/inventory_items.py +3 -3
- regscale/integrations/scanner_integration.py +172 -41
- regscale/models/integration_models/cisa_kev_data.json +20 -5
- regscale/models/integration_models/synqly_models/capabilities.json +1 -1
- regscale/models/integration_models/tenable_models/integration.py +42 -7
- regscale/models/regscale_models/regscale_model.py +1 -1
- regscale/models/regscale_models/vulnerability.py +21 -0
- {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.2.0.dist-info}/METADATA +3 -3
- {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.2.0.dist-info}/RECORD +32 -27
- regscale/integrations/commercial/grype.py +0 -165
- regscale/integrations/commercial/opentext/click.py +0 -99
- regscale/integrations/commercial/trivy.py +0 -162
- /regscale/models/integration_models/{flat_file_importer.py → flat_file_importer/__init__.py} +0 -0
- {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.2.0.dist-info}/LICENSE +0 -0
- {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.2.0.dist-info}/WHEEL +0 -0
- {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.2.0.dist-info}/entry_points.txt +0 -0
- {regscale_cli-6.16.1.0.dist-info → regscale_cli-6.16.2.0.dist-info}/top_level.txt +0 -0
regscale/integrations/jsonl_scanner_integration.py
@@ -0,0 +1,869 @@
"""
Abstract base class for scanner integrations that use JSONL files for intermediate storage.
"""

import dataclasses
import json
import logging
import os
import shutil
import tempfile
from typing import Any, Dict, Iterator, Optional, Union, Tuple, TypeVar, Type, List

import boto3
from pathlib import Path

from regscale.core.app.utils.file_utils import is_s3_path, read_file, find_files, download_from_s3
from regscale.exceptions import ValidationException
from regscale.integrations.scanner_integration import IntegrationAsset, IntegrationFinding, ScannerIntegration
from regscale.models.app_models.mapping import Mapping

logger = logging.getLogger("regscale")

# Define generic types for items that can be written to file
T = TypeVar("T")
ItemType = TypeVar("ItemType", IntegrationAsset, IntegrationFinding)


class JSONLScannerIntegration(ScannerIntegration):
    """
    Abstract base class for scanner integrations that use JSONL files for intermediate storage.

    This class extends ScannerIntegration to provide common functionality for scanners
    that process source files (local or S3) and store the results in JSONL files before syncing to RegScale.
    Supports reading files directly without downloading when read_files_only is True.

    Subclasses must implement:
    - find_valid_files: To find valid source files
    - parse_asset: To parse an asset from a source file
    - parse_finding: To parse a finding from a source file
    - is_valid_file: To validate a file before processing
    """

    # Constants for file paths - subclasses should override these
    ASSETS_FILE = "./artifacts/assets.jsonl"
    FINDINGS_FILE = "./artifacts/findings.jsonl"

    def __init__(self, *args, **kwargs):
        """
        Initialize the JSONLScannerIntegration.
        """
        # Extract S3-related kwargs
        self.s3_bucket = kwargs.pop("s3_bucket", None)
        self.s3_prefix = kwargs.pop("s3_prefix", "")
        self.aws_profile = kwargs.pop("aws_profile", "default")

        self.plan_id = kwargs.pop("plan_id", None)
        self.file_path = kwargs.pop("file_path", None)
        self.empty_files: bool = True
        self.scan_date = kwargs.pop("scan_date", None)
        self.download_destination = kwargs.pop("destination", None)
        self.file_pattern = kwargs.pop("file_pattern", "*.json")
        self.read_files_only = kwargs.pop("read_files_only", False)

        # Extract mapping-related kwargs
        self.disable_mapping = kwargs.pop("disable_mapping", False)
        self.mapping_path = kwargs.pop("mapping_path", f"./mappings/{self.__class__.__name__.lower()}/mapping.json")
        self.required_asset_fields = kwargs.pop("required_asset_fields", ["identifier", "name"])
        self.required_finding_fields = kwargs.pop("required_finding_fields", ["asset_identifier", "title", "severity"])
        self.mapping = self._load_mapping() if not self.disable_mapping else None

        # Initialize parent class
        super().__init__(plan_id=self.plan_id, **kwargs)

        self.s3_client = None
        if self.s3_bucket and not self.read_files_only:
            try:
                session = boto3.Session(profile_name=self.aws_profile)
                self.s3_client = session.client("s3")
            except Exception as e:
                logger.error(f"Failed to initialize S3 client with profile {self.aws_profile}: {str(e)}")
                raise ValidationException(f"S3 client initialization failed: {str(e)}")

    def _load_mapping(self) -> Optional[Mapping]:
        """Load the mapping configuration from a JSON file."""
        try:
            mapping_file = Path(self.mapping_path)
            if mapping_file.exists():
                with mapping_file.open("r") as f:
                    mapping_data = json.load(f)
                return Mapping(**mapping_data)
            logger.debug(f"No mapping file found at {self.mapping_path}, using default mapping")
            return None
        except Exception as e:
            logger.error(f"Error loading mapping file {self.mapping_path}: {str(e)}")
            return None

    def _apply_mapping(
        self, source_data: Dict[str, Any], target_fields: Dict[str, Any], mapping: Dict[str, str]
    ) -> Dict[str, Any]:
        """Apply field mapping from source data to target fields."""
        mapped_data = target_fields.copy()

        if self.disable_mapping or not self.mapping or not hasattr(self.mapping, "fields"):
            return {**source_data, **mapped_data}

        for target_field, source_field in mapping.items():
            if source_field in source_data:
                mapped_data[target_field] = source_data[source_field]
            elif isinstance(source_field, dict) and "default" in source_field:
                mapped_data[target_field] = source_field["default"]

        return mapped_data

    def _validate_fields(self, item: Union[IntegrationAsset, IntegrationFinding], required_fields: list) -> None:
        """Validate that all required fields are present and non-empty."""
        missing_fields = []
        item_dict = dataclasses.asdict(item)

        for field in required_fields:
            if field not in item_dict or not item_dict[field]:
                missing_fields.append(field)

        if missing_fields:
            item_type = "asset" if isinstance(item, IntegrationAsset) else "finding"
            raise ValueError(f"Missing or empty required fields for {item_type}: {', '.join(missing_fields)}")

    def create_artifacts_dir(self) -> Path:
        """Create artifacts directory if it doesn't exist."""
        artifacts_dir = Path("./artifacts")
        artifacts_dir.mkdir(exist_ok=True, parents=True)
        return artifacts_dir

    def _get_item_key(self, item_dict: Dict[str, Any], item_type: str) -> str:
        """Generate a unique key for an item (asset or finding) dictionary."""
        if item_type == "asset":
            return item_dict.get("identifier", "unknown")
        else:  # finding
            asset_id = item_dict.get("asset_identifier", "unknown")
            cve = item_dict.get("cve", "")
            title = item_dict.get("title", "")
            if cve:
                return f"{asset_id}:{cve}"
            return f"{asset_id}:{title}"

    def _prepare_output_file(self, output_file: str, empty_file: bool, item_type: str) -> Dict[str, bool]:
        """Prepare output file and load existing records if necessary."""
        existing_items: Dict[str, bool] = {}

        if empty_file and os.path.exists(output_file):
            logger.info(f"Emptying existing file: {output_file}")
            open(output_file, "w").close()
        elif os.path.exists(output_file) and os.path.getsize(output_file) > 0:
            logger.info(f"Reading existing records from: {output_file}")
            try:
                with open(output_file, "r") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            key = self._get_item_key(record, item_type)
                            existing_items[key] = True
                        except json.JSONDecodeError:
                            logger.warning(f"Could not parse line in {output_file}")
            except Exception as e:
                logger.warning(f"Error reading existing records: {str(e)}")

        return existing_items

    def _write_items_to_jsonl(
        self,
        file_path: str,
        output_file: str,
        item_type: str,
        empty_file: bool = True,
    ) -> int:
        """
        Process source files (local or S3) and write items (assets or findings) to JSONL.

        :param str file_path: Path to source file or directory (local or S3 URI)
        :param str output_file: Path to output JSONL file
        :param str item_type: Type of items to process ('asset' or 'finding')
        :param bool empty_file: Whether to empty the output file before writing (default: True)
        :return: Total count of items written
        :rtype: int
        """
        existing_items = self._prepare_output_file(output_file, empty_file, item_type)
        total_items_count = len(existing_items)
        processed_files = set()
        new_items_count = 0

        with open(output_file, "a") as output_f:
            for file_data in self.find_valid_files(file_path):
                if isinstance(file_data, tuple) and len(file_data) >= 2:
                    file, data = file_data[0], file_data[1]
                else:
                    file, data = file_data, None

                file_str = str(file)
                if file_str in processed_files:
                    continue

                processed_files.add(file_str)

                try:
                    logger.info(f"Processing file: {file}")
                    if item_type == "asset":
                        self._process_asset_file(file, data, output_f, existing_items)
                        new_items_count += 1
                        total_items_count += 1
                    else:
                        findings_count = self._process_finding_file(file, data, output_f, existing_items)
                        new_items_count += findings_count
                        total_items_count += findings_count

                except Exception as e:
                    logger.error(f"Error processing file {file}: {str(e)}")

        item_type_label = "assets" if item_type == "asset" else "findings"
        logger.info(f"Added {new_items_count} new {item_type_label} to {output_file}")
        return total_items_count

    def _process_asset_file(self, file, data, output_f, existing_items):
        """
        Process a single file for assets with mapping and validation.

        :param file: The file being processed
        :param data: The data from the file
        :param output_f: The output file handle
        :param existing_items: Dictionary of existing items
        :return: Number of assets processed
        :rtype: int
        """
        asset = self.parse_asset(file, data)
        asset_dict = dataclasses.asdict(asset)

        if not self.disable_mapping:
            mapped_asset_dict = self._apply_mapping(
                data or {},
                asset_dict,
                getattr(self.mapping, "fields", {}).get("asset_mapping", {}) if self.mapping else {},
            )
            mapped_asset = IntegrationAsset(**mapped_asset_dict)
        else:
            mapped_asset = asset

        self._validate_fields(mapped_asset, self.required_asset_fields)

        key = self._get_item_key(dataclasses.asdict(mapped_asset), "asset")
        if key in existing_items:
            logger.debug(f"Asset with identifier {key} already exists, skipping")
            return 0

        output_f.write(json.dumps(dataclasses.asdict(mapped_asset)) + "\n")
        output_f.flush()
        existing_items[key] = True
        return 1

    def _process_finding_file(self, file, data, output_f, existing_items):
        """
        Process a single file for findings with mapping and validation.

        :param file: The file being processed
        :param data: The data from the file
        :param output_f: The output file handle
        :param existing_items: Dictionary of existing items
        :return: Number of findings processed
        :rtype: int
        """
        asset = self.parse_asset(file, data)
        identifier = asset.identifier
        findings_data = self._get_findings_data_from_file(data)

        findings_in_file = 0
        for finding_item in findings_data:
            finding = self.parse_finding(identifier, data, finding_item)
            finding_dict = dataclasses.asdict(finding)

            if not self.disable_mapping:
                mapped_finding_dict = self._apply_mapping(
                    finding_item,
                    finding_dict,
                    getattr(self.mapping, "fields", {}).get("finding_mapping", {}) if self.mapping else {},
                )
                mapped_finding = IntegrationFinding(**mapped_finding_dict)
            else:
                mapped_finding = finding

            self._validate_fields(mapped_finding, self.required_finding_fields)

            key = self._get_item_key(dataclasses.asdict(mapped_finding), "finding")
            if key in existing_items:
                logger.debug(f"Finding with key {key} already exists, skipping")
                continue

            output_f.write(json.dumps(dataclasses.asdict(mapped_finding)) + "\n")
            output_f.flush()
            existing_items[key] = True
            findings_in_file += 1

        if findings_in_file > 0:
            logger.info(f"Added {findings_in_file} new findings from file {file}")
        return findings_in_file

    def _get_findings_data_from_file(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract findings data from file data (default implementation).

        Subclasses must override this method to extract findings data from their specific file format.

        :param Dict[str, Any] data: The data from the file
        :return: Iterable of finding items
        """
        # Default implementation returns an empty list
        # Subclasses must override this method
        return []

    def _yield_items_from_jsonl(self, jsonl_file: str, item_class: Type[ItemType]) -> Iterator[ItemType]:
        """
        Read items from JSONL file and yield them one by one.

        :param str jsonl_file: Path to JSONL file containing items
        :param Type[ItemType] item_class: Class to convert dictionary items to (IntegrationAsset or IntegrationFinding)
        :yields: Items one at a time
        :rtype: Iterator[ItemType]
        """
        if not os.path.exists(jsonl_file):
            logger.warning(f"JSONL file {jsonl_file} does not exist")
            return

        logger.info(f"Reading items from {jsonl_file}")
        with open(jsonl_file, "r") as f:
            for line_number, line in enumerate(f, 1):
                try:
                    item_dict = json.loads(line.strip())
                    yield item_class(**item_dict)
                except json.JSONDecodeError:
                    logger.warning(f"Could not parse line {line_number} in {jsonl_file}")
                except Exception as e:
                    logger.error(f"Error processing line {line_number} in {jsonl_file}: {str(e)}")

    def _process_files(
        self,
        file_path: Union[str, Path],
        assets_output_file: str,
        findings_output_file: str,
        empty_assets_file: bool = True,
        empty_findings_file: bool = True,
    ) -> Tuple[int, int]:
        """
        Process files (local or S3) to extract both assets and findings in a single pass.

        Optimizes file processing by reading each file once to extract asset and finding data.

        :param Union[str, Path] file_path: Path to source file or directory (local or S3 URI)
        :param str assets_output_file: Path to output JSONL file for assets
        :param str findings_output_file: Path to output JSONL file for findings
        :param bool empty_assets_file: Whether to empty the assets file before writing (default: True)
        :param bool empty_findings_file: Whether to empty the findings file before writing (default: True)
        :return: Tuple of total asset and finding counts
        :rtype: Tuple[int, int]
        """
        existing_assets = self._prepare_output_file(assets_output_file, empty_assets_file, "asset")
        existing_findings = self._prepare_output_file(findings_output_file, empty_findings_file, "finding")

        asset_tracker = self._init_tracker(existing_assets)
        finding_tracker = self._init_tracker(existing_findings)
        processed_files = set()

        with open(assets_output_file, "a") as assets_file, open(findings_output_file, "a") as findings_file:
            for file, data in self._get_valid_file_data(file_path):
                if str(file) in processed_files:
                    continue

                processed_files.add(str(file))
                self._process_file(file, data, assets_file, findings_file, asset_tracker, finding_tracker)

        self._log_processing_results(asset_tracker.new_count, assets_output_file, "assets")
        self._log_processing_results(finding_tracker.new_count, findings_output_file, "findings")
        return asset_tracker.total_count, finding_tracker.total_count

    def _init_tracker(self, existing_items: Dict[str, bool]) -> "CountTracker":
        """
        Initialize a tracker for counting new and total items.

        :param Dict[str, bool] existing_items: Dictionary of existing item keys
        :return: Tracker object for managing counts
        :rtype: CountTracker
        """
        from dataclasses import dataclass

        @dataclass
        class CountTracker:
            existing: Dict[str, bool]
            new_count: int = 0
            total_count: int = 0

        return CountTracker(existing=existing_items, total_count=len(existing_items))

    def _get_valid_file_data(
        self, file_path: Union[str, Path]
    ) -> Iterator[Tuple[Union[Path, str], Optional[Dict[str, Any]]]]:
        """
        Yield valid file data from the given path.

        :param Union[str, Path] file_path: Path to source file or directory (local or S3 URI)
        :return: Iterator yielding tuples of (file path, parsed data)
        :rtype: Iterator[Tuple[Union[Path, str], Optional[Dict[str, Any]]]]
        """
        for file_data in self.find_valid_files(file_path):
            if isinstance(file_data, tuple) and len(file_data) >= 2:
                yield file_data[0], file_data[1]
            else:
                yield file_data, None

    def _process_file(
        self,
        file: Union[Path, str],
        data: Optional[Dict[str, Any]],
        assets_file: Any,
        findings_file: Any,
        asset_tracker: "CountTracker",
        finding_tracker: "CountTracker",
    ) -> None:
        """
        Process a single file for assets and findings.

        :param Union[Path, str] file: Path to the file being processed
        :param Optional[Dict[str, Any]] data: Parsed data from the file
        :param Any assets_file: Open file handle for writing assets
        :param Any findings_file: Open file handle for writing findings
        :param CountTracker asset_tracker: Tracker for asset counts
        :param CountTracker finding_tracker: Tracker for finding counts
        :rtype: None
        """
        try:
            logger.info(f"Processing file: {file}")
            self._process_asset(file, data, assets_file, asset_tracker)
            self._process_findings(file, data, findings_file, asset_tracker.existing, finding_tracker)
        except Exception as e:
            logger.error(f"Error processing file {file}: {str(e)}")

    def _process_asset(
        self,
        file: Union[Path, str],
        data: Optional[Dict[str, Any]],
        assets_file: Any,
        tracker: "CountTracker",
    ) -> None:
        """
        Process and write a single asset from file data.

        :param Union[Path, str] file: Path to the file being processed
        :param Optional[Dict[str, Any]] data: Parsed data from the file
        :param Any assets_file: Open file handle for writing assets
        :param CountTracker tracker: Tracker for asset counts
        :rtype: None
        """
        asset = self.parse_asset(file, data)
        asset_dict = dataclasses.asdict(asset)
        mapped_asset = self._map_item(asset_dict, "asset_mapping", IntegrationAsset)
        self._validate_fields(mapped_asset, self.required_asset_fields)

        asset_key = mapped_asset.identifier
        if asset_key not in tracker.existing:
            self._write_item(assets_file, mapped_asset)
            tracker.existing[asset_key] = True
            tracker.new_count += 1
            tracker.total_count += 1
        else:
            logger.debug(f"Asset with identifier {asset_key} already exists, skipping")

    def _process_findings(
        self,
        file: Union[Path, str],
        data: Optional[Dict[str, Any]],
        findings_file: Any,
        existing_assets: Dict[str, bool],
        tracker: "CountTracker",
    ) -> None:
        """
        Process and write findings from file data.

        :param Union[Path, str] file: Path to the file being processed
        :param Optional[Dict[str, Any]] data: Parsed data from the file
        :param Any findings_file: Open file handle for writing findings
        :param Dict[str, bool] existing_assets: Dictionary of existing asset keys
        :param CountTracker tracker: Tracker for finding counts
        :rtype: None
        """
        findings_data = self._get_findings_data_from_file(data)
        logger.info(f"Found {len(findings_data)} findings in file: {file}")
        findings_added = 0

        asset_id = list(existing_assets.keys())[0] if existing_assets else "unknown"
        for finding_item in findings_data:
            finding = self.parse_finding(asset_id, data, finding_item)
            finding_dict = dataclasses.asdict(finding)
            mapped_finding = self._map_item(finding_dict, "finding_mapping", IntegrationFinding)
            self._validate_fields(mapped_finding, self.required_finding_fields)

            finding_key = self._get_item_key(dataclasses.asdict(mapped_finding), "finding")
            if finding_key not in tracker.existing:
                self._write_item(findings_file, mapped_finding)
                tracker.existing[finding_key] = True
                tracker.new_count += 1
                tracker.total_count += 1
                findings_added += 1
            else:
                logger.debug(f"Finding with key {finding_key} already exists, skipping")

        if findings_added > 0:
            logger.info(f"Added {findings_added} new findings from file {file}")

    def _map_item(self, item_dict: Dict[str, Any], mapping_key: str, item_class: Type) -> Any:
        """
        Apply mapping to an item dictionary if enabled.

        :param Dict[str, Any] item_dict: Dictionary of item data
        :param str mapping_key: Key in the mapping configuration to use (e.g., 'asset_mapping')
        :param Type item_class: Class to instantiate with mapped data (IntegrationAsset or IntegrationFinding)
        :return: Instantiated item object with mapped data
        :rtype: Any
        """
        if not self.disable_mapping and self.mapping and hasattr(self.mapping, "fields"):
            mapped_dict = self._apply_mapping(
                item_dict, item_dict, getattr(self.mapping, "fields", {}).get(mapping_key, {})
            )
            return item_class(**mapped_dict)
        return item_class(**item_dict)

    def _write_item(self, file_handle: Any, item: Any) -> None:
        """
        Write an item to the specified file handle.

        :param Any file_handle: Open file handle to write to
        :param Any item: Item object to write (IntegrationAsset or IntegrationFinding)
        :rtype: None
        """
        file_handle.write(json.dumps(dataclasses.asdict(item)) + "\n")
        file_handle.flush()

    def _log_processing_results(self, new_count: int, output_file: str, item_type: str) -> None:
        """
        Log the results of processing items.

        :param int new_count: Number of new items added
        :param str output_file: Path to the output file
        :param str item_type: Type of items processed ('assets' or 'findings')
        :rtype: None
        """
        logger.info(f"Added {new_count} new {item_type} to {output_file}")

    def _validate_file_path(self, file_path: Optional[str]) -> str:
        """
        Validates the file path and raises an exception if it's invalid.

        :param Optional[str] file_path: Path to validate
        :return: The validated file path
        :rtype: str
        :raises ValidationException: If the file path is invalid
        """
        if not file_path:
            logger.error("No file path provided")
            raise ValidationException("File path is required")

        if not is_s3_path(file_path) and not os.path.exists(file_path):
            logger.error(f"File path does not exist: {file_path}")
            raise ValidationException(f"Path does not exist: {file_path}")

        return file_path

    def fetch_assets(self, *args: Any, **kwargs: Any) -> Iterator[IntegrationAsset]:
        """
        Fetches assets from processed source files (local or S3).

        This method supports both local files/directories and S3 paths.

        :param str file_path: Path to a source file or directory
        :param bool empty_file: Whether to empty the output file before writing (default: True)
        :param bool process_together: Whether to process assets and findings together (default: False)
        :param bool use_jsonl_file: Whether to use an existing JSONL file instead of processing source files
            (default: False)
        :yields: Iterator[IntegrationAsset]
        """
        logger.info("Starting fetch_assets")
        file_path = kwargs.get("file_path", self.file_path)
        empty_file = kwargs.get("empty_file", True)
        process_together = kwargs.get("process_together", False)
        use_jsonl_file = kwargs.get("use_jsonl_file", False)

        self.create_artifacts_dir()

        if use_jsonl_file:
            logger.info(f"Using existing JSONL file: {self.ASSETS_FILE}")
            total_assets = sum(1 for _ in open(self.ASSETS_FILE, "r")) if os.path.exists(self.ASSETS_FILE) else 0
            self.num_assets_to_process = total_assets
            logger.info(f"Found {total_assets} assets in existing JSONL file")
        else:
            file_path = self._validate_file_path(file_path)
            if process_together:
                total_assets, _ = self._process_files(
                    file_path,
                    self.ASSETS_FILE,
                    self.FINDINGS_FILE,
                    empty_assets_file=empty_file,
                    empty_findings_file=False,
                )
                self.num_assets_to_process = total_assets
            else:
                total_assets = self._write_items_to_jsonl(file_path, self.ASSETS_FILE, "asset", empty_file=empty_file)
                self.num_assets_to_process = total_assets
            logger.info(f"Total assets to process: {total_assets}")

        for asset in self._yield_items_from_jsonl(self.ASSETS_FILE, IntegrationAsset):
            yield asset

        logger.info(f"Assets read from JSONL complete. Total assets identified: {self.num_assets_to_process}")

    def fetch_findings(self, *args: Any, **kwargs: Any) -> Iterator[IntegrationFinding]:
        """
        Fetches findings from processed source files (local or S3).

        This method supports both local files/directories and S3 paths.

        :param str file_path: Path to source file or directory
        :param bool empty_file: Whether to empty the output file before writing (default: True)
        :param bool process_together: Whether to process assets and findings together (default: False)
        :param bool use_jsonl_file: Whether to use an existing JSONL file instead of processing source files (default: False)
        :yields: Iterator[IntegrationFinding]
        """
        logger.info("Starting fetch_findings")
        file_path = kwargs.get("file_path", self.file_path)
        empty_file = kwargs.get("empty_file", True)
        process_together = kwargs.get("process_together", False)
        use_jsonl_file = kwargs.get("use_jsonl_file", False)

        self.create_artifacts_dir()

        if use_jsonl_file:
            logger.info(f"Using existing JSONL file: {self.FINDINGS_FILE}")
            total_findings = sum(1 for _ in open(self.FINDINGS_FILE, "r")) if os.path.exists(self.FINDINGS_FILE) else 0
            self.num_findings_to_process = total_findings
            logger.info(f"Found {total_findings} findings in existing JSONL file")
        else:
            file_path = self._validate_file_path(file_path)
            if process_together:
                _, total_findings = self._process_files(
                    file_path,
                    self.ASSETS_FILE,
                    self.FINDINGS_FILE,
                    empty_assets_file=False,
                    empty_findings_file=empty_file,
                )
                self.num_findings_to_process = total_findings
            else:
                total_findings = self._write_items_to_jsonl(
                    file_path, self.FINDINGS_FILE, "finding", empty_file=empty_file
                )
                self.num_findings_to_process = total_findings
            logger.info(f"Total findings to process: {total_findings}")

        for finding in self._yield_items_from_jsonl(self.FINDINGS_FILE, IntegrationFinding):
            yield finding

        logger.info(f"Findings read from JSONL complete. Total findings identified: {self.num_findings_to_process}")

    def fetch_assets_and_findings(
        self, file_path: str = None, empty_files: bool = True
    ) -> Tuple[Iterator[IntegrationAsset], Iterator[IntegrationFinding]]:
        """Process both assets and findings (local or S3) in a single pass and return iterators.

        This method optimizes the processing by reading each file only once and extracting
        both asset and finding information in a single pass. It returns two iterators,
        one for assets and one for findings.

        :param str file_path: Path to source file or directory
        :param bool empty_files: Whether to empty both output files before writing (default: True)
        :return: Tuple of (assets_iterator, findings_iterator)
        :rtype: Tuple[Iterator[IntegrationAsset], Iterator[IntegrationFinding]]
        """
        file_path = self._validate_file_path(file_path or self.file_path)
        self.create_artifacts_dir()

        logger.info("Processing assets and findings together from %s", file_path)
        total_assets, total_findings = self._process_files(
            file_path=file_path,
            assets_output_file=self.ASSETS_FILE,
            findings_output_file=self.FINDINGS_FILE,
            empty_assets_file=empty_files,
            empty_findings_file=empty_files,
        )

        self.num_assets_to_process = total_assets
        self.num_findings_to_process = total_findings

        assets_iterator = self._yield_items_from_jsonl(self.ASSETS_FILE, IntegrationAsset)
        findings_iterator = self._yield_items_from_jsonl(self.FINDINGS_FILE, IntegrationFinding)
        return assets_iterator, findings_iterator

    def sync_assets_and_findings(self) -> None:
        """Process both assets and findings (local or S3) in a single pass and sync to RegScale.

        This method optimizes the processing by reading each file only once and
        extracting both asset and finding information in a single pass.

        :param int plan_id: RegScale Security Plan ID
        :param str file_path: Path to source file or directory
        :param bool empty_files: Whether to empty both output files before writing (default: True)
        :rtype: None
        """
        file_path = self._validate_file_path(self.file_path)
        logger.info("Processing assets and findings together from %s", file_path)
        total_assets, total_findings = self._process_files(
            file_path=file_path,
            assets_output_file=self.ASSETS_FILE,
            findings_output_file=self.FINDINGS_FILE,
            empty_assets_file=self.empty_files,
            empty_findings_file=self.empty_files,
        )

        logger.info("Syncing %d assets to RegScale", total_assets)
        self.sync_assets(
            plan_id=self.plan_id,
            file_path=file_path,
            use_jsonl_file=True,
            asset_count=total_assets,
        )

        logger.info("Syncing %d findings to RegScale", total_findings)
        self.sync_findings(
            plan_id=self.plan_id,
            file_path=file_path,
            use_jsonl_file=True,
            finding_count=total_findings,
        )

        logger.info("Assets and findings sync complete")

    # Abstract method with default implementation for reading files
    def find_valid_files(self, path: Union[Path, str]) -> Iterator[Tuple[Union[Path, str], Dict[str, Any]]]:
        """
        Find all valid source files in the given path and read their contents if read_files_only is True.

        Subclasses must override this method to customize file validation and data extraction.

        :param Union[Path, str] path: Path to a file or directory (local or S3 URI)
        :return: Iterator yielding tuples of (file path, validated data)
        :rtype: Iterator[Tuple[Union[Path, str], Dict[str, Any]]]
        """
        for file in find_files(path, self.file_pattern):
            data = self._read_file_content(file)
            if data is not None:
                yield from self._validate_and_yield(file, data)

    def _read_file_content(self, file: Union[Path, str]) -> Optional[Dict[str, Any]]:
        """
        Read and parse the content of a file based on read_files_only setting.

        :param Union[Path, str] file: Path to the file to read
        :return: Parsed JSON data or None if reading fails
        :rtype: Optional[Dict[str, Any]]
        """
        try:
            if self.read_files_only:
                return self._read_content_directly(file)
            return self._read_content_with_download(file)
        except json.JSONDecodeError:
            logger.warning(f"File {file} is not valid JSON, skipping")
            return None
        except Exception as e:
            logger.error(f"Error reading file {file}: {str(e)}")
            return None

    def _read_content_directly(self, file: Union[Path, str]) -> Dict[str, Any]:
        """
        Read file content directly when read_files_only is True.

        :param Union[Path, str] file: Path to the file
        :return: Parsed JSON data
        :rtype: Dict[str, Any]
        """
        content = read_file(file)
        return json.loads(content) if content else {}

    def _read_content_with_download(self, file: Union[Path, str]) -> Dict[str, Any]:
        """
        Read file content, downloading from S3 if necessary, when read_files_only is False.

        :param Union[Path, str] file: Path to the file (local or S3 URI)
        :return: Parsed JSON data
        :rtype: Dict[str, Any]
        """
        if is_s3_path(file):
            temp_dir = Path(tempfile.mkdtemp())
            try:
                s3_parts = file[5:].split("/", 1)
                bucket = s3_parts[0]
                prefix = s3_parts[1] if len(s3_parts) > 1 else ""
                download_from_s3(bucket, prefix, temp_dir, self.aws_profile)
                local_file = temp_dir / os.path.basename(prefix)
                with open(local_file, "r") as f:
                    return json.load(f)
            finally:
                shutil.rmtree(temp_dir)
        else:
            with open(file, "r") as f:
                return json.load(f)

    def _validate_and_yield(
        self, file: Union[Path, str], data: Dict[str, Any]
    ) -> Iterator[Tuple[Union[Path, str], Dict[str, Any]]]:
        """
        Validate file data and yield it if valid.

        :param Union[Path, str] file: Path to the file
        :param Dict[str, Any] data: Parsed data from the file
        :return: Iterator yielding valid file data tuples
        :rtype: Iterator[Tuple[Union[Path, str], Dict[str, Any]]]
        """
        is_valid, validated_data = self.is_valid_file(data, file)
        if is_valid and validated_data is not None:
            yield file, validated_data

    def parse_asset(self, file_path: Union[Path, str], data: Dict[str, Any]) -> IntegrationAsset:
        """
        Parse a single asset from source data.

        Subclasses must implement this method to parse assets from their specific file format.

        :param Union[Path, str] file_path: Path to the file containing the asset data
        :param Dict[str, Any] data: The parsed data
        :return: IntegrationAsset object
        :rtype: IntegrationAsset
        """
        raise NotImplementedError("Subclasses must implement parse_asset")

    def parse_finding(self, asset_identifier: str, data: Dict[str, Any], item: Dict[str, Any]) -> IntegrationFinding:
        """Parse a single finding from source data.

        Subclasses must implement this method to parse findings from their specific file format.

        :param str asset_identifier: The identifier of the asset this finding belongs to
        :param Dict[str, Any] data: The asset data
        :param Dict[str, Any] item: The finding data
        :return: IntegrationFinding object
        :rtype: IntegrationFinding
        """
        raise NotImplementedError("Subclasses must implement parse_finding")

    def is_valid_file(self, data: Any, file_path: Union[Path, str]) -> Tuple[bool, Optional[Dict[str, Any]]]:
        """
        Check if the provided data is valid for processing.

        This default implementation ensures the data is a non-empty dictionary.
        Subclasses should override this to implement specific validation logic.

        :param Any data: Data parsed from the file to validate
        :param Union[Path, str] file_path: Path to the file being processed
        :return: Tuple of (is_valid, data) where is_valid indicates validity and data is the validated content or None
        :rtype: Tuple[bool, Optional[Dict[str, Any]]]
        """
        if not isinstance(data, dict):
            logger.warning(f"Data is not a dictionary for file {file_path}, skipping")
            return False, None

        if not data:
            logger.warning(f"Data is an empty dictionary for file {file_path}, skipping")
            return False, None

        return True, data
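
To make the subclass contract above concrete, here is a minimal, hypothetical sketch of how an integration built on this class might look. It is not part of the release: the report layout (a "target" block and a "vulnerabilities" list) is invented for illustration, and the constructor arguments passed to IntegrationAsset and IntegrationFinding are assumptions chosen to match the default required_asset_fields and required_finding_fields above; the real dataclasses in regscale.integrations.scanner_integration define additional fields not shown here.

class ExampleJSONLScanner(JSONLScannerIntegration):
    """Hypothetical scanner that reads one JSON report per asset."""

    ASSETS_FILE = "./artifacts/example_assets.jsonl"
    FINDINGS_FILE = "./artifacts/example_findings.jsonl"

    def is_valid_file(self, data, file_path):
        # Assumed report shape: a dict with a "target" block describing the scanned asset
        if not isinstance(data, dict) or "target" not in data:
            return False, None
        return True, data

    def parse_asset(self, file_path, data):
        target = data.get("target", {})
        # "identifier" and "name" mirror the default required_asset_fields
        return IntegrationAsset(
            identifier=target.get("id", str(file_path)),
            name=target.get("name", "unknown"),
        )

    def _get_findings_data_from_file(self, data):
        # Assumed report shape: findings live under a "vulnerabilities" key
        return data.get("vulnerabilities", [])

    def parse_finding(self, asset_identifier, data, item):
        # "asset_identifier", "title", and "severity" mirror the default required_finding_fields;
        # "cve" feeds the asset_id:cve key used for deduplication in _get_item_key
        return IntegrationFinding(
            asset_identifier=asset_identifier,
            title=item.get("id", "unknown"),
            severity=item.get("severity", "low"),
            cve=item.get("cve", ""),
        )


# Hypothetical usage; plan_id and file_path are illustrative values.
# scanner = ExampleJSONLScanner(plan_id=123, file_path="./scan_output")
# scanner.sync_assets_and_findings()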