classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,620 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import AsyncGenerator
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ...models.generated_input import (
|
|
7
|
+
SamplingStrategy,
|
|
8
|
+
ServiceDeskInput,
|
|
9
|
+
ServiceDeskOptional,
|
|
10
|
+
ServiceDeskOptionalConnection,
|
|
11
|
+
ServiceDeskOptionalContent,
|
|
12
|
+
)
|
|
13
|
+
from ...models.generated_single_asset_scan_results import (
|
|
14
|
+
AssetType as OutputAssetType,
|
|
15
|
+
)
|
|
16
|
+
from ...models.generated_single_asset_scan_results import (
|
|
17
|
+
DetectionResult,
|
|
18
|
+
Location,
|
|
19
|
+
SingleAssetScanResults,
|
|
20
|
+
)
|
|
21
|
+
from ...utils.file_parser import resolve_mime_type
|
|
22
|
+
from ...utils.hashing import hash_url, normalize_http_url
|
|
23
|
+
from ..atlassian_common import (
|
|
24
|
+
AtlassianCloudClient,
|
|
25
|
+
dedupe_preserve_order,
|
|
26
|
+
deterministic_sample,
|
|
27
|
+
extract_urls_from_text,
|
|
28
|
+
is_tabular_filename,
|
|
29
|
+
is_tabular_mime_type,
|
|
30
|
+
json_dumps,
|
|
31
|
+
normalize_atlassian_base_url,
|
|
32
|
+
parse_atlassian_document,
|
|
33
|
+
parse_datetime,
|
|
34
|
+
)
|
|
35
|
+
from ..base import BaseSource
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ServiceDeskSource(BaseSource):
|
|
41
|
+
source_type = "servicedesk"
|
|
42
|
+
|
|
43
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
):
    """Validate the recipe, build the Atlassian client and set up per-run caches.

    Args:
        recipe: Raw source recipe; validated with ``ServiceDeskInput.model_validate``.
        source_id: Forwarded unchanged to ``BaseSource.__init__``.
        runner_id: Forwarded to ``BaseSource.__init__``; stored on the instance as
            ``"local-run"`` when it is missing or empty.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)
    self.config = ServiceDeskInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    self.base_url = normalize_atlassian_base_url(str(self.config.required.base_url))
    connection = self._connection_options()
    # Unset (or zero) connection options fall back to the literals below.
    self.client = AtlassianCloudClient(
        base_url=self.base_url,
        account_email=str(self.config.required.account_email),
        api_token=self.config.masked.api_token,
        request_timeout_seconds=float(connection.request_timeout_seconds or 30),
        max_retries=int(connection.max_retries or 3),
        rate_limit_delay_seconds=float(connection.rate_limit_delay_seconds or 0),
    )

    content_options = self._content_options()
    # Comments/attachments are included unless explicitly set to False
    # (an unset value, i.e. None, counts as enabled).
    self.include_comments = content_options.include_comments is not False
    self.include_attachments = content_options.include_attachments is not False
    self.attachment_max_bytes = int(content_options.attachment_max_bytes or 5_242_880)

    self._seen_asset_hashes: set[str] = set()
    self._hash_to_url: dict[str, str] = {}
    self._asset_content_cache: dict[str, tuple[str, str]] = {}
    self._attachment_url_by_hash: dict[str, str] = {}
    # Previously only created by _reset_runtime_state(); _attachment_assets()
    # writes to it, so it must exist even before the first extract_raw() call.
    self._attachment_name_by_hash: dict[str, str] = {}
|
|
73
|
+
|
|
74
|
+
def _optional(self) -> ServiceDeskOptional:
|
|
75
|
+
if self.config.optional:
|
|
76
|
+
return self.config.optional
|
|
77
|
+
return ServiceDeskOptional()
|
|
78
|
+
|
|
79
|
+
def _connection_options(self) -> ServiceDeskOptionalConnection:
|
|
80
|
+
optional = self._optional()
|
|
81
|
+
if optional.connection:
|
|
82
|
+
return optional.connection
|
|
83
|
+
return ServiceDeskOptionalConnection()
|
|
84
|
+
|
|
85
|
+
def _content_options(self) -> ServiceDeskOptionalContent:
|
|
86
|
+
optional = self._optional()
|
|
87
|
+
if optional.content:
|
|
88
|
+
return optional.content
|
|
89
|
+
return ServiceDeskOptionalContent()
|
|
90
|
+
|
|
91
|
+
def test_connection(self) -> dict[str, Any]:
|
|
92
|
+
result = {
|
|
93
|
+
"timestamp": datetime.now(UTC).isoformat(),
|
|
94
|
+
"source_type": self.recipe.get("type"),
|
|
95
|
+
}
|
|
96
|
+
try:
|
|
97
|
+
self.client.get_json(
|
|
98
|
+
"/rest/servicedeskapi/servicedesk", params={"start": 0, "limit": 1}
|
|
99
|
+
)
|
|
100
|
+
result["status"] = "SUCCESS"
|
|
101
|
+
result["message"] = "Successfully connected to Jira Service Management API."
|
|
102
|
+
except Exception as exc:
|
|
103
|
+
result["status"] = "FAILURE"
|
|
104
|
+
result["message"] = f"Failed to connect to Jira Service Management API: {exc}"
|
|
105
|
+
return result
|
|
106
|
+
|
|
107
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield de-duplicated assets for the sampled requests in batches.

    Per-run state is reset first. Full batches of ``self.BATCH_SIZE`` assets are
    yielded as soon as they fill up; whatever is left over is yielded last.
    Nothing is yielded when the source was aborted before the call, and an abort
    during the run stops processing further requests.
    """
    if self._aborted:
        return

    self._reset_runtime_state()
    selected = self._sample_requests(self._fetch_requests())

    buffer: list[SingleAssetScanResults] = []
    for request in selected:
        if self._aborted:
            break
        for asset in self._extract_request_assets(request):
            # Assets whose hash was already seen in this run are dropped.
            if self._add_asset_if_new(buffer, asset):
                while len(buffer) >= self.BATCH_SIZE:
                    full, buffer = buffer[: self.BATCH_SIZE], buffer[self.BATCH_SIZE :]
                    if full:
                        yield full

    if buffer:
        yield buffer
|
|
132
|
+
|
|
133
|
+
def _reset_runtime_state(self) -> None:
|
|
134
|
+
self._seen_asset_hashes = set()
|
|
135
|
+
self._hash_to_url = {}
|
|
136
|
+
self._asset_content_cache = {}
|
|
137
|
+
self._attachment_url_by_hash = {}
|
|
138
|
+
self._attachment_name_by_hash = {}
|
|
139
|
+
|
|
140
|
+
def _fetch_requests(self) -> list[dict[str, Any]]:
|
|
141
|
+
scope = self._optional().scope
|
|
142
|
+
base_filters: dict[str, Any] = {}
|
|
143
|
+
if scope:
|
|
144
|
+
if scope.search_term:
|
|
145
|
+
base_filters["searchTerm"] = str(scope.search_term)
|
|
146
|
+
if scope.request_status:
|
|
147
|
+
base_filters["requestStatus"] = str(scope.request_status)
|
|
148
|
+
if scope.request_ownership:
|
|
149
|
+
ownership = [
|
|
150
|
+
str(value).strip() for value in scope.request_ownership if str(value).strip()
|
|
151
|
+
]
|
|
152
|
+
if ownership:
|
|
153
|
+
base_filters["requestOwnership"] = ownership
|
|
154
|
+
if scope.organization_id is not None:
|
|
155
|
+
base_filters["organizationId"] = int(scope.organization_id)
|
|
156
|
+
|
|
157
|
+
service_desk_ids = [int(value) for value in (scope.service_desk_ids or [])] if scope else []
|
|
158
|
+
request_type_ids = [int(value) for value in (scope.request_type_ids or [])] if scope else []
|
|
159
|
+
|
|
160
|
+
result_by_key: dict[str, dict[str, Any]] = {}
|
|
161
|
+
|
|
162
|
+
for filters in self._request_filter_combinations(
|
|
163
|
+
base_filters, service_desk_ids, request_type_ids
|
|
164
|
+
):
|
|
165
|
+
for item in self.client.iter_servicedesk_values(
|
|
166
|
+
"/rest/servicedeskapi/request",
|
|
167
|
+
params=filters,
|
|
168
|
+
limit=50,
|
|
169
|
+
):
|
|
170
|
+
key = str(item.get("issueKey") or item.get("issueId") or "")
|
|
171
|
+
if key:
|
|
172
|
+
result_by_key[key] = item
|
|
173
|
+
|
|
174
|
+
return list(result_by_key.values())
|
|
175
|
+
|
|
176
|
+
def _request_filter_combinations(
|
|
177
|
+
self,
|
|
178
|
+
base_filters: dict[str, Any],
|
|
179
|
+
service_desk_ids: list[int],
|
|
180
|
+
request_type_ids: list[int],
|
|
181
|
+
) -> list[dict[str, Any]]:
|
|
182
|
+
combinations: list[dict[str, Any]] = []
|
|
183
|
+
if service_desk_ids and request_type_ids:
|
|
184
|
+
for service_desk_id in service_desk_ids:
|
|
185
|
+
for request_type_id in request_type_ids:
|
|
186
|
+
combinations.append(
|
|
187
|
+
{
|
|
188
|
+
**base_filters,
|
|
189
|
+
"serviceDeskId": service_desk_id,
|
|
190
|
+
"requestTypeId": request_type_id,
|
|
191
|
+
}
|
|
192
|
+
)
|
|
193
|
+
elif service_desk_ids:
|
|
194
|
+
for service_desk_id in service_desk_ids:
|
|
195
|
+
combinations.append({**base_filters, "serviceDeskId": service_desk_id})
|
|
196
|
+
elif request_type_ids:
|
|
197
|
+
for request_type_id in request_type_ids:
|
|
198
|
+
combinations.append({**base_filters, "requestTypeId": request_type_id})
|
|
199
|
+
else:
|
|
200
|
+
combinations.append(dict(base_filters))
|
|
201
|
+
return combinations
|
|
202
|
+
|
|
203
|
+
def _sample_requests(self, requests: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Reduce ``requests`` according to the configured sampling strategy.

    ``ALL`` returns the input untouched. Otherwise the cap is ``rows_per_page``
    (100 when unset); inputs that already fit are returned as-is, ``RANDOM``
    delegates to ``deterministic_sample``, and any other strategy keeps the
    entries with the most recent ``_request_sort_timestamp``.
    """
    sampling = self.config.sampling
    strategy = sampling.strategy
    if strategy == SamplingStrategy.ALL:
        return requests

    limit = int(sampling.rows_per_page or 100)
    if len(requests) <= limit:
        return requests

    if strategy == SamplingStrategy.RANDOM:
        return deterministic_sample(requests, limit)

    newest_first = sorted(requests, key=self._request_sort_timestamp, reverse=True)
    return newest_first[:limit]
|
|
221
|
+
|
|
222
|
+
def _request_sort_timestamp(self, request: dict[str, Any]) -> datetime:
|
|
223
|
+
current_status = request.get("currentStatus")
|
|
224
|
+
if isinstance(current_status, dict):
|
|
225
|
+
status_date = current_status.get("statusDate")
|
|
226
|
+
parsed = self._parse_date_dto(status_date)
|
|
227
|
+
if parsed:
|
|
228
|
+
return parsed
|
|
229
|
+
return self._parse_date_dto(request.get("createdDate")) or datetime.now(UTC)
|
|
230
|
+
|
|
231
|
+
def _extract_request_assets(self, request: dict[str, Any]) -> list[SingleAssetScanResults]:
    """Build the assets for one request: the request, its comments, its attachments.

    Returns an empty list when the request has neither ``issueKey`` nor
    ``issueId``. Otherwise the first element is the request asset, whose
    ``links`` are the de-duplicated hashes of the comments asset, the
    attachments and every URL found in the body or comments. The request's
    metadata JSON and rendered text are stored in ``_asset_content_cache``.
    """
    now = datetime.now(UTC)
    issue_key = str(request.get("issueKey") or request.get("issueId") or "")
    if not issue_key:
        return []

    request_url = self._request_external_url(request, issue_key)
    request_hash = self.generate_hash_id(request_url)
    summary = str(request.get("summary") or issue_key)

    body_text, body_urls = self._request_body_text_and_urls(request)
    comment_asset, comment_hashes, comment_urls = self._comments_asset(
        issue_key, request_url, now
    )
    attachment_assets, attachment_hashes = self._attachment_assets(issue_key, request_hash, now)

    # Hash only the URLs that survive normalization.
    url_hashes: list[str] = []
    for raw_url in (*body_urls, *comment_urls):
        normalized = normalize_http_url(raw_url, base_url=self.base_url)
        if normalized:
            url_hashes.append(self.generate_hash_id(normalized))

    request_links = dedupe_preserve_order([*comment_hashes, *attachment_hashes, *url_hashes])

    status_name = self._status_name(request)
    request_type_name = self._name_from_obj(request.get("requestType"))
    service_desk_name = self._name_from_obj(request.get("serviceDesk"))

    request_metadata = {
        "issue_key": issue_key,
        "summary": summary,
        "current_status": status_name,
        "request_type": request_type_name,
        "service_desk": service_desk_name,
        "attachments_count": len(attachment_hashes),
        "links_count": len(request_links),
    }

    # Empty entries are dropped before joining, so an empty body adds no line.
    text_parts = (
        f"key={issue_key}",
        f"summary={summary}",
        f"status={status_name}",
        f"service_desk={service_desk_name}",
        f"request_type={request_type_name}",
        body_text,
    )
    request_text = "\n".join(part for part in text_parts if part).strip()
    self._asset_content_cache[request_hash] = (json_dumps(request_metadata), request_text)

    request_asset = SingleAssetScanResults(
        hash=request_hash,
        checksum=self.calculate_checksum(request_metadata),
        name=f"{issue_key}: {summary}",
        external_url=request_url,
        links=request_links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=self._parse_date_dto(request.get("createdDate")) or now,
        updated_at=self._request_sort_timestamp(request),
        runner_id=self.runner_id,
    )

    return [
        request_asset,
        *([comment_asset] if comment_asset else []),
        *attachment_assets,
    ]
|
|
300
|
+
|
|
301
|
+
def _request_external_url(self, request: dict[str, Any], issue_key: str) -> str:
|
|
302
|
+
links = request.get("_links")
|
|
303
|
+
if isinstance(links, dict):
|
|
304
|
+
web_url = links.get("web")
|
|
305
|
+
if isinstance(web_url, str):
|
|
306
|
+
normalized = normalize_http_url(web_url, base_url=self.base_url)
|
|
307
|
+
if normalized:
|
|
308
|
+
return normalized
|
|
309
|
+
return f"{self.base_url}/browse/{issue_key}"
|
|
310
|
+
|
|
311
|
+
def _request_body_text_and_urls(self, request: dict[str, Any]) -> tuple[str, list[str]]:
|
|
312
|
+
lines: list[str] = []
|
|
313
|
+
urls: list[str] = []
|
|
314
|
+
field_values = request.get("requestFieldValues")
|
|
315
|
+
if not isinstance(field_values, list):
|
|
316
|
+
return "", []
|
|
317
|
+
for field in field_values:
|
|
318
|
+
if not isinstance(field, dict):
|
|
319
|
+
continue
|
|
320
|
+
label = str(field.get("label") or field.get("fieldId") or "field")
|
|
321
|
+
value_text, value_urls = self._text_and_urls(field.get("value"))
|
|
322
|
+
rendered_text, rendered_urls = self._text_and_urls(field.get("renderedValue"))
|
|
323
|
+
content = value_text or rendered_text
|
|
324
|
+
if content:
|
|
325
|
+
lines.append(f"{label}: {content}")
|
|
326
|
+
urls.extend(value_urls)
|
|
327
|
+
urls.extend(rendered_urls)
|
|
328
|
+
return "\n".join(lines), dedupe_preserve_order(urls)
|
|
329
|
+
|
|
330
|
+
def _comments_asset(
|
|
331
|
+
self,
|
|
332
|
+
issue_key: str,
|
|
333
|
+
request_url: str,
|
|
334
|
+
now: datetime,
|
|
335
|
+
) -> tuple[SingleAssetScanResults | None, list[str], list[str]]:
|
|
336
|
+
if not self.include_comments:
|
|
337
|
+
return None, [], []
|
|
338
|
+
|
|
339
|
+
comments = self.client.iter_servicedesk_values(
|
|
340
|
+
f"/rest/servicedeskapi/request/{issue_key}/comment",
|
|
341
|
+
limit=50,
|
|
342
|
+
)
|
|
343
|
+
if not comments:
|
|
344
|
+
return None, [], []
|
|
345
|
+
|
|
346
|
+
text_blocks: list[str] = []
|
|
347
|
+
urls: list[str] = []
|
|
348
|
+
for comment in comments:
|
|
349
|
+
body = comment.get("body")
|
|
350
|
+
if isinstance(body, str) and body.strip():
|
|
351
|
+
text_blocks.append(body.strip())
|
|
352
|
+
urls.extend(extract_urls_from_text(body))
|
|
353
|
+
|
|
354
|
+
combined_text = "\n\n".join(block for block in text_blocks if block).strip()
|
|
355
|
+
if not combined_text:
|
|
356
|
+
return None, [], dedupe_preserve_order(urls)
|
|
357
|
+
|
|
358
|
+
comments_url = f"{request_url}?view=comments"
|
|
359
|
+
comments_hash = self.generate_hash_id(comments_url)
|
|
360
|
+
|
|
361
|
+
comment_link_hashes = [
|
|
362
|
+
self.generate_hash_id(normalized)
|
|
363
|
+
for normalized in (
|
|
364
|
+
normalize_http_url(url, base_url=self.base_url)
|
|
365
|
+
for url in dedupe_preserve_order(urls)
|
|
366
|
+
)
|
|
367
|
+
if normalized
|
|
368
|
+
]
|
|
369
|
+
self._asset_content_cache[comments_hash] = (combined_text, combined_text)
|
|
370
|
+
|
|
371
|
+
asset = SingleAssetScanResults(
|
|
372
|
+
hash=comments_hash,
|
|
373
|
+
checksum=self.calculate_checksum(
|
|
374
|
+
{
|
|
375
|
+
"issue_key": issue_key,
|
|
376
|
+
"comments_count": len(comments),
|
|
377
|
+
"text_length": len(combined_text),
|
|
378
|
+
}
|
|
379
|
+
),
|
|
380
|
+
name=f"Comments for request {issue_key}",
|
|
381
|
+
external_url=comments_url,
|
|
382
|
+
links=comment_link_hashes,
|
|
383
|
+
asset_type=OutputAssetType.TXT,
|
|
384
|
+
source_id=self.source_id,
|
|
385
|
+
created_at=now,
|
|
386
|
+
updated_at=now,
|
|
387
|
+
runner_id=self.runner_id,
|
|
388
|
+
)
|
|
389
|
+
return asset, [comments_hash], urls
|
|
390
|
+
|
|
391
|
+
def _attachment_assets(
|
|
392
|
+
self,
|
|
393
|
+
issue_key: str,
|
|
394
|
+
request_hash: str,
|
|
395
|
+
now: datetime,
|
|
396
|
+
) -> tuple[list[SingleAssetScanResults], list[str]]:
|
|
397
|
+
if not self.include_attachments:
|
|
398
|
+
return [], []
|
|
399
|
+
|
|
400
|
+
attachments = self.client.iter_servicedesk_values(
|
|
401
|
+
f"/rest/servicedeskapi/request/{issue_key}/attachment",
|
|
402
|
+
limit=50,
|
|
403
|
+
)
|
|
404
|
+
assets: list[SingleAssetScanResults] = []
|
|
405
|
+
hashes: list[str] = []
|
|
406
|
+
for attachment in attachments:
|
|
407
|
+
if not isinstance(attachment, dict):
|
|
408
|
+
continue
|
|
409
|
+
links = attachment.get("_links")
|
|
410
|
+
content_url = links.get("content") if isinstance(links, dict) else None
|
|
411
|
+
if not isinstance(content_url, str):
|
|
412
|
+
continue
|
|
413
|
+
normalized_url = normalize_http_url(content_url, base_url=self.base_url)
|
|
414
|
+
if not normalized_url:
|
|
415
|
+
continue
|
|
416
|
+
|
|
417
|
+
attachment_hash = self.generate_hash_id(normalized_url)
|
|
418
|
+
self._attachment_url_by_hash[attachment_hash] = normalized_url
|
|
419
|
+
mime = str(attachment.get("mimeType") or "").lower()
|
|
420
|
+
filename = str(attachment.get("filename") or "attachment")
|
|
421
|
+
self._attachment_name_by_hash[attachment_hash] = filename
|
|
422
|
+
metadata = {
|
|
423
|
+
"request_hash": request_hash,
|
|
424
|
+
"mime_type": mime,
|
|
425
|
+
"size": attachment.get("size"),
|
|
426
|
+
"filename": filename,
|
|
427
|
+
}
|
|
428
|
+
assets.append(
|
|
429
|
+
SingleAssetScanResults(
|
|
430
|
+
hash=attachment_hash,
|
|
431
|
+
checksum=self.calculate_checksum(metadata),
|
|
432
|
+
name=filename,
|
|
433
|
+
external_url=normalized_url,
|
|
434
|
+
links=[],
|
|
435
|
+
asset_type=self._asset_type_from_mime_or_name(mime, filename),
|
|
436
|
+
source_id=self.source_id,
|
|
437
|
+
created_at=now,
|
|
438
|
+
updated_at=now,
|
|
439
|
+
runner_id=self.runner_id,
|
|
440
|
+
)
|
|
441
|
+
)
|
|
442
|
+
hashes.append(attachment_hash)
|
|
443
|
+
return assets, hashes
|
|
444
|
+
|
|
445
|
+
def _name_from_obj(self, value: Any) -> str:
|
|
446
|
+
if isinstance(value, dict):
|
|
447
|
+
name = value.get("name")
|
|
448
|
+
if isinstance(name, str):
|
|
449
|
+
return name
|
|
450
|
+
return ""
|
|
451
|
+
|
|
452
|
+
def _status_name(self, request: dict[str, Any]) -> str:
|
|
453
|
+
current_status = request.get("currentStatus")
|
|
454
|
+
if isinstance(current_status, dict):
|
|
455
|
+
status = current_status.get("status")
|
|
456
|
+
if isinstance(status, str):
|
|
457
|
+
return status
|
|
458
|
+
return ""
|
|
459
|
+
|
|
460
|
+
def _parse_date_dto(self, value: Any) -> datetime | None:
|
|
461
|
+
if isinstance(value, str):
|
|
462
|
+
return parse_datetime(value)
|
|
463
|
+
if isinstance(value, dict):
|
|
464
|
+
for key in ("iso8601", "jira", "friendly"):
|
|
465
|
+
candidate = value.get(key)
|
|
466
|
+
if isinstance(candidate, str) and candidate.strip():
|
|
467
|
+
return parse_datetime(candidate)
|
|
468
|
+
return None
|
|
469
|
+
|
|
470
|
+
def _text_and_urls(self, value: Any) -> tuple[str, list[str]]:
    """Return the text of a field value together with the URLs it contains.

    Strings are returned unchanged with the URLs extracted from them. Anything
    else is handed to ``parse_atlassian_document``; the URLs it reports are
    merged (de-duplicated, order kept) with those extracted from its text.
    """
    if not isinstance(value, str):
        text, document_urls = parse_atlassian_document(value)
        return text, dedupe_preserve_order(document_urls + extract_urls_from_text(text))
    return value, extract_urls_from_text(value)
|
|
475
|
+
|
|
476
|
+
def _asset_type_from_mime_or_name(
    self,
    mime_type: str,
    file_name: str,
) -> OutputAssetType:
    """Classify an attachment by MIME type, using the file name as a tie-breaker.

    The MIME-derived type wins unless it is ``BINARY``; in that case a tabular
    file name upgrades the result to ``TABLE``.
    """
    from_mime = self._asset_type_from_mime(mime_type)
    if from_mime == OutputAssetType.BINARY and is_tabular_filename(file_name):
        return OutputAssetType.TABLE
    return from_mime
|
|
487
|
+
|
|
488
|
+
def _asset_type_from_mime(self, mime_type: str) -> OutputAssetType:
    """Map a MIME type (case-insensitive) to an output asset type.

    Checked in order: ``image/``, ``video/`` and ``audio/`` prefixes, tabular
    MIME types, a fixed set of plain-text/JSON/XML types (``TXT``), then
    ``text/html`` (``URL``). Everything else is ``BINARY``.
    """
    normalized = mime_type.lower()

    media_prefixes = (
        ("image/", OutputAssetType.IMAGE),
        ("video/", OutputAssetType.VIDEO),
        ("audio/", OutputAssetType.AUDIO),
    )
    for prefix, asset_type in media_prefixes:
        if normalized.startswith(prefix):
            return asset_type

    if is_tabular_mime_type(normalized):
        return OutputAssetType.TABLE

    textual_types = {"text/plain", "application/json", "application/xml", "text/xml"}
    if normalized in textual_types:
        return OutputAssetType.TXT
    if normalized == "text/html":
        return OutputAssetType.URL
    return OutputAssetType.BINARY
|
|
508
|
+
|
|
509
|
+
def _add_asset_if_new(
|
|
510
|
+
self,
|
|
511
|
+
assets: list[SingleAssetScanResults],
|
|
512
|
+
asset: SingleAssetScanResults,
|
|
513
|
+
) -> bool:
|
|
514
|
+
if asset.hash in self._seen_asset_hashes:
|
|
515
|
+
return False
|
|
516
|
+
self._seen_asset_hashes.add(asset.hash)
|
|
517
|
+
assets.append(asset)
|
|
518
|
+
return True
|
|
519
|
+
|
|
520
|
+
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
    """Download the raw bytes behind an asset and resolve their MIME type.

    ``asset_id`` may be an asset hash or a URL; a value that normalizes as a URL
    is converted to its hash first. The download URL is looked up among the
    known attachments and then among all hashed URLs.

    Returns:
        ``(bytes, mime_type)``, with the bytes cut to ``attachment_max_bytes``
        when that limit is positive; ``None`` when the asset is unknown or the
        download fails (the failure is logged as a warning).
    """
    normalized = normalize_http_url(asset_id, base_url=self.base_url)
    if normalized:
        asset_id = self.generate_hash_id(normalized)

    attachment_url = self._attachment_url_by_hash.get(asset_id) or self._hash_to_url.get(
        asset_id
    )
    if not attachment_url:
        return None

    try:
        file_bytes, declared_mime = self.client.get_bytes(attachment_url)
    except Exception as exc:
        logger.warning(
            "Failed to fetch Service Desk attachment bytes for %s: %s",
            attachment_url,
            exc,
        )
        return None

    size_cap = self.attachment_max_bytes
    if 0 < size_cap < len(file_bytes):
        file_bytes = file_bytes[:size_cap]

    file_name = self._attachment_file_name(asset_id, attachment_url)
    return file_bytes, resolve_mime_type(
        file_bytes,
        declared_mime_type=declared_mime,
        file_name=file_name,
    )
|
|
551
|
+
|
|
552
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, downloading if needed.

    The content cache is consulted first, by the given id and then by the hash
    of the id when it normalizes as a URL. On a miss the attachment is
    downloaded, cut to ``attachment_max_bytes`` when that limit is positive,
    and parsed; a result with text is cached under the asset hash.

    Returns:
        The content pair, or ``None`` when the asset is unknown, the download
        fails (logged as a warning) or parsing yields no text.
    """
    hit = self._asset_content_cache.get(asset_id)
    if hit:
        return hit

    normalized = normalize_http_url(asset_id, base_url=self.base_url)
    if normalized:
        asset_id = self.generate_hash_id(normalized)
        hit = self._asset_content_cache.get(asset_id)
        if hit:
            return hit

    attachment_url = self._attachment_url_by_hash.get(asset_id) or self._hash_to_url.get(
        asset_id
    )
    if not attachment_url:
        return None

    try:
        file_bytes, declared_mime = self.client.get_bytes(attachment_url)
    except Exception as exc:
        logger.warning("Failed to fetch Service Desk attachment %s: %s", attachment_url, exc)
        return None

    size_cap = self.attachment_max_bytes
    if 0 < size_cap < len(file_bytes):
        file_bytes = file_bytes[:size_cap]

    parsed = self.parse_asset_bytes(
        file_bytes,
        declared_mime_type=declared_mime,
        file_name=self._attachment_file_name(asset_id, attachment_url),
    )
    if not parsed.text_content:
        return None

    self._asset_content_cache[asset_id] = (parsed.raw_content, parsed.text_content)
    return parsed.raw_content, parsed.text_content
|
|
590
|
+
|
|
591
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Return the stable hash for a URL and remember the hash-to-URL mapping.

    Raises:
        ValueError: If ``asset_id`` cannot be normalized into an HTTP URL.
    """
    canonical_url = normalize_http_url(asset_id, base_url=self.base_url)
    if canonical_url:
        digest = hash_url(canonical_url, base_url=self.base_url)
        # Recorded so hashes can later be turned back into fetchable URLs.
        self._hash_to_url[digest] = canonical_url
        return digest
    raise ValueError(f"Invalid URL for hash: {asset_id}")
|
|
598
|
+
|
|
599
|
+
def resolve_link_for_detection(self, link: str) -> str | None:
    """Translate a link hash back to its URL, or normalize ``link`` itself when it is not a known hash."""
    return self._hash_to_url.get(link) or normalize_http_url(link)
|
|
604
|
+
|
|
605
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Point the finding at the asset's external URL; ``text_content`` is not used."""
    del text_content  # accepted for interface compatibility only
    finding.location = Location(path=asset.external_url)
|
|
613
|
+
|
|
614
|
+
def abort(self) -> None:
    """Log the abort, run the base-class abort handling, then close the HTTP client."""
    logger.info("Aborting Service Desk extraction...")
    super().abort()
    # Closed after the base class has run its own abort logic.
    self.client.close()
|
|
618
|
+
|
|
619
|
+
def cleanup(self) -> None:
    """Release the Atlassian client by closing it."""
    self.client.close()
|