classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,620 @@
1
+ import logging
2
+ from collections.abc import AsyncGenerator
3
+ from datetime import UTC, datetime
4
+ from typing import Any
5
+
6
+ from ...models.generated_input import (
7
+ SamplingStrategy,
8
+ ServiceDeskInput,
9
+ ServiceDeskOptional,
10
+ ServiceDeskOptionalConnection,
11
+ ServiceDeskOptionalContent,
12
+ )
13
+ from ...models.generated_single_asset_scan_results import (
14
+ AssetType as OutputAssetType,
15
+ )
16
+ from ...models.generated_single_asset_scan_results import (
17
+ DetectionResult,
18
+ Location,
19
+ SingleAssetScanResults,
20
+ )
21
+ from ...utils.file_parser import resolve_mime_type
22
+ from ...utils.hashing import hash_url, normalize_http_url
23
+ from ..atlassian_common import (
24
+ AtlassianCloudClient,
25
+ dedupe_preserve_order,
26
+ deterministic_sample,
27
+ extract_urls_from_text,
28
+ is_tabular_filename,
29
+ is_tabular_mime_type,
30
+ json_dumps,
31
+ normalize_atlassian_base_url,
32
+ parse_atlassian_document,
33
+ parse_datetime,
34
+ )
35
+ from ..base import BaseSource
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ class ServiceDeskSource(BaseSource):
41
+ source_type = "servicedesk"
42
+
43
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and prepare the Atlassian client and run-time caches.

    Args:
        recipe: Raw source recipe; validated against ``ServiceDeskInput``.
        source_id: Identifier forwarded to the base source.
        runner_id: Identifier of the current run; ``"local-run"`` is used
            when it is missing or empty.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)
    self.config = ServiceDeskInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    self.base_url = normalize_atlassian_base_url(str(self.config.required.base_url))
    connection = self._connection_options()
    # NOTE(review): `or` treats an explicit 0 the same as "unset", so a
    # configured 0 falls back to the default for timeout and retries.
    self.client = AtlassianCloudClient(
        base_url=self.base_url,
        account_email=str(self.config.required.account_email),
        api_token=self.config.masked.api_token,
        request_timeout_seconds=float(connection.request_timeout_seconds or 30),
        max_retries=int(connection.max_retries or 3),
        rate_limit_delay_seconds=float(connection.rate_limit_delay_seconds or 0),
    )

    content_options = self._content_options()
    # Only an explicit False disables a feature; None (unset) keeps it on.
    self.include_comments = content_options.include_comments is not False
    self.include_attachments = content_options.include_attachments is not False
    self.attachment_max_bytes = int(content_options.attachment_max_bytes or 5_242_880)

    # Per-run lookup tables. Keep this list in sync with
    # `_reset_runtime_state`, which recreates every one of them.
    self._seen_asset_hashes: set[str] = set()
    self._hash_to_url: dict[str, str] = {}
    self._asset_content_cache: dict[str, tuple[str, str]] = {}
    self._attachment_url_by_hash: dict[str, str] = {}
    # Previously only created by `_reset_runtime_state`; `_attachment_assets`
    # writes to it, so it must exist from construction onwards.
    self._attachment_name_by_hash: dict[str, str] = {}
73
+
74
+ def _optional(self) -> ServiceDeskOptional:
75
+ if self.config.optional:
76
+ return self.config.optional
77
+ return ServiceDeskOptional()
78
+
79
+ def _connection_options(self) -> ServiceDeskOptionalConnection:
80
+ optional = self._optional()
81
+ if optional.connection:
82
+ return optional.connection
83
+ return ServiceDeskOptionalConnection()
84
+
85
+ def _content_options(self) -> ServiceDeskOptionalContent:
86
+ optional = self._optional()
87
+ if optional.content:
88
+ return optional.content
89
+ return ServiceDeskOptionalContent()
90
+
91
+ def test_connection(self) -> dict[str, Any]:
92
+ result = {
93
+ "timestamp": datetime.now(UTC).isoformat(),
94
+ "source_type": self.recipe.get("type"),
95
+ }
96
+ try:
97
+ self.client.get_json(
98
+ "/rest/servicedeskapi/servicedesk", params={"start": 0, "limit": 1}
99
+ )
100
+ result["status"] = "SUCCESS"
101
+ result["message"] = "Successfully connected to Jira Service Management API."
102
+ except Exception as exc:
103
+ result["status"] = "FAILURE"
104
+ result["message"] = f"Failed to connect to Jira Service Management API: {exc}"
105
+ return result
106
+
107
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield de-duplicated assets for the sampled requests in batches.

    Run-time caches are reset first. Full batches of ``BATCH_SIZE`` assets are
    yielded as soon as they fill up; whatever remains is yielded at the end.
    Nothing is yielded if the source was aborted before the call, and an abort
    during the run stops processing further requests.
    """
    if self._aborted:
        return

    self._reset_runtime_state()
    selected = self._sample_requests(self._fetch_requests())

    buffer: list[SingleAssetScanResults] = []
    for service_request in selected:
        if self._aborted:
            break
        for candidate in self._extract_request_assets(service_request):
            appended = self._add_asset_if_new(buffer, candidate)
            if appended:
                # Drain every complete batch before accepting more assets.
                while len(buffer) >= self.BATCH_SIZE:
                    full, buffer = buffer[: self.BATCH_SIZE], buffer[self.BATCH_SIZE :]
                    if full:
                        yield full

    if buffer:
        yield buffer
132
+
133
+ def _reset_runtime_state(self) -> None:
134
+ self._seen_asset_hashes = set()
135
+ self._hash_to_url = {}
136
+ self._asset_content_cache = {}
137
+ self._attachment_url_by_hash = {}
138
+ self._attachment_name_by_hash = {}
139
+
140
+ def _fetch_requests(self) -> list[dict[str, Any]]:
141
+ scope = self._optional().scope
142
+ base_filters: dict[str, Any] = {}
143
+ if scope:
144
+ if scope.search_term:
145
+ base_filters["searchTerm"] = str(scope.search_term)
146
+ if scope.request_status:
147
+ base_filters["requestStatus"] = str(scope.request_status)
148
+ if scope.request_ownership:
149
+ ownership = [
150
+ str(value).strip() for value in scope.request_ownership if str(value).strip()
151
+ ]
152
+ if ownership:
153
+ base_filters["requestOwnership"] = ownership
154
+ if scope.organization_id is not None:
155
+ base_filters["organizationId"] = int(scope.organization_id)
156
+
157
+ service_desk_ids = [int(value) for value in (scope.service_desk_ids or [])] if scope else []
158
+ request_type_ids = [int(value) for value in (scope.request_type_ids or [])] if scope else []
159
+
160
+ result_by_key: dict[str, dict[str, Any]] = {}
161
+
162
+ for filters in self._request_filter_combinations(
163
+ base_filters, service_desk_ids, request_type_ids
164
+ ):
165
+ for item in self.client.iter_servicedesk_values(
166
+ "/rest/servicedeskapi/request",
167
+ params=filters,
168
+ limit=50,
169
+ ):
170
+ key = str(item.get("issueKey") or item.get("issueId") or "")
171
+ if key:
172
+ result_by_key[key] = item
173
+
174
+ return list(result_by_key.values())
175
+
176
+ def _request_filter_combinations(
177
+ self,
178
+ base_filters: dict[str, Any],
179
+ service_desk_ids: list[int],
180
+ request_type_ids: list[int],
181
+ ) -> list[dict[str, Any]]:
182
+ combinations: list[dict[str, Any]] = []
183
+ if service_desk_ids and request_type_ids:
184
+ for service_desk_id in service_desk_ids:
185
+ for request_type_id in request_type_ids:
186
+ combinations.append(
187
+ {
188
+ **base_filters,
189
+ "serviceDeskId": service_desk_id,
190
+ "requestTypeId": request_type_id,
191
+ }
192
+ )
193
+ elif service_desk_ids:
194
+ for service_desk_id in service_desk_ids:
195
+ combinations.append({**base_filters, "serviceDeskId": service_desk_id})
196
+ elif request_type_ids:
197
+ for request_type_id in request_type_ids:
198
+ combinations.append({**base_filters, "requestTypeId": request_type_id})
199
+ else:
200
+ combinations.append(dict(base_filters))
201
+ return combinations
202
+
203
def _sample_requests(self, requests: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Reduce the fetched requests according to the recipe's sampling settings.

    The input list is returned unchanged for the ``ALL`` strategy or when the
    limit (``rows_per_page``, default 100) already covers every request.
    ``RANDOM`` delegates to ``deterministic_sample``. Any other strategy keeps
    the ``limit`` most recent requests, ordered newest first.
    """
    plan = self.config.sampling
    mode = plan.strategy
    if mode == SamplingStrategy.ALL:
        return requests

    cap = int(plan.rows_per_page or 100)
    if len(requests) <= cap:
        return requests

    if mode == SamplingStrategy.RANDOM:
        return deterministic_sample(requests, cap)

    newest_first = sorted(requests, key=self._request_sort_timestamp, reverse=True)
    return newest_first[:cap]
221
+
222
+ def _request_sort_timestamp(self, request: dict[str, Any]) -> datetime:
223
+ current_status = request.get("currentStatus")
224
+ if isinstance(current_status, dict):
225
+ status_date = current_status.get("statusDate")
226
+ parsed = self._parse_date_dto(status_date)
227
+ if parsed:
228
+ return parsed
229
+ return self._parse_date_dto(request.get("createdDate")) or datetime.now(UTC)
230
+
231
+ def _extract_request_assets(self, request: dict[str, Any]) -> list[SingleAssetScanResults]:
232
+ now = datetime.now(UTC)
233
+ issue_key = str(request.get("issueKey") or request.get("issueId") or "")
234
+ if not issue_key:
235
+ return []
236
+
237
+ request_url = self._request_external_url(request, issue_key)
238
+ request_hash = self.generate_hash_id(request_url)
239
+ summary = str(request.get("summary") or issue_key)
240
+
241
+ body_text, body_urls = self._request_body_text_and_urls(request)
242
+ comment_asset, comment_hashes, comment_urls = self._comments_asset(
243
+ issue_key, request_url, now
244
+ )
245
+ attachment_assets, attachment_hashes = self._attachment_assets(issue_key, request_hash, now)
246
+
247
+ all_url_hashes = [
248
+ self.generate_hash_id(url)
249
+ for url in (
250
+ normalize_http_url(value, base_url=self.base_url)
251
+ for value in [*body_urls, *comment_urls]
252
+ )
253
+ if url
254
+ ]
255
+
256
+ request_links = dedupe_preserve_order(
257
+ [*comment_hashes, *attachment_hashes, *all_url_hashes]
258
+ )
259
+
260
+ request_metadata = {
261
+ "issue_key": issue_key,
262
+ "summary": summary,
263
+ "current_status": self._status_name(request),
264
+ "request_type": self._name_from_obj(request.get("requestType")),
265
+ "service_desk": self._name_from_obj(request.get("serviceDesk")),
266
+ "attachments_count": len(attachment_hashes),
267
+ "links_count": len(request_links),
268
+ }
269
+
270
+ request_text_lines = [
271
+ f"key={issue_key}",
272
+ f"summary={summary}",
273
+ f"status={self._status_name(request)}",
274
+ f"service_desk={self._name_from_obj(request.get('serviceDesk'))}",
275
+ f"request_type={self._name_from_obj(request.get('requestType'))}",
276
+ "",
277
+ body_text,
278
+ ]
279
+ request_text = "\n".join(line for line in request_text_lines if line).strip()
280
+ self._asset_content_cache[request_hash] = (json_dumps(request_metadata), request_text)
281
+
282
+ request_asset = SingleAssetScanResults(
283
+ hash=request_hash,
284
+ checksum=self.calculate_checksum(request_metadata),
285
+ name=f"{issue_key}: {summary}",
286
+ external_url=request_url,
287
+ links=request_links,
288
+ asset_type=OutputAssetType.TXT,
289
+ source_id=self.source_id,
290
+ created_at=self._parse_date_dto(request.get("createdDate")) or now,
291
+ updated_at=self._request_sort_timestamp(request),
292
+ runner_id=self.runner_id,
293
+ )
294
+
295
+ assets: list[SingleAssetScanResults] = [request_asset]
296
+ if comment_asset:
297
+ assets.append(comment_asset)
298
+ assets.extend(attachment_assets)
299
+ return assets
300
+
301
+ def _request_external_url(self, request: dict[str, Any], issue_key: str) -> str:
302
+ links = request.get("_links")
303
+ if isinstance(links, dict):
304
+ web_url = links.get("web")
305
+ if isinstance(web_url, str):
306
+ normalized = normalize_http_url(web_url, base_url=self.base_url)
307
+ if normalized:
308
+ return normalized
309
+ return f"{self.base_url}/browse/{issue_key}"
310
+
311
+ def _request_body_text_and_urls(self, request: dict[str, Any]) -> tuple[str, list[str]]:
312
+ lines: list[str] = []
313
+ urls: list[str] = []
314
+ field_values = request.get("requestFieldValues")
315
+ if not isinstance(field_values, list):
316
+ return "", []
317
+ for field in field_values:
318
+ if not isinstance(field, dict):
319
+ continue
320
+ label = str(field.get("label") or field.get("fieldId") or "field")
321
+ value_text, value_urls = self._text_and_urls(field.get("value"))
322
+ rendered_text, rendered_urls = self._text_and_urls(field.get("renderedValue"))
323
+ content = value_text or rendered_text
324
+ if content:
325
+ lines.append(f"{label}: {content}")
326
+ urls.extend(value_urls)
327
+ urls.extend(rendered_urls)
328
+ return "\n".join(lines), dedupe_preserve_order(urls)
329
+
330
+ def _comments_asset(
331
+ self,
332
+ issue_key: str,
333
+ request_url: str,
334
+ now: datetime,
335
+ ) -> tuple[SingleAssetScanResults | None, list[str], list[str]]:
336
+ if not self.include_comments:
337
+ return None, [], []
338
+
339
+ comments = self.client.iter_servicedesk_values(
340
+ f"/rest/servicedeskapi/request/{issue_key}/comment",
341
+ limit=50,
342
+ )
343
+ if not comments:
344
+ return None, [], []
345
+
346
+ text_blocks: list[str] = []
347
+ urls: list[str] = []
348
+ for comment in comments:
349
+ body = comment.get("body")
350
+ if isinstance(body, str) and body.strip():
351
+ text_blocks.append(body.strip())
352
+ urls.extend(extract_urls_from_text(body))
353
+
354
+ combined_text = "\n\n".join(block for block in text_blocks if block).strip()
355
+ if not combined_text:
356
+ return None, [], dedupe_preserve_order(urls)
357
+
358
+ comments_url = f"{request_url}?view=comments"
359
+ comments_hash = self.generate_hash_id(comments_url)
360
+
361
+ comment_link_hashes = [
362
+ self.generate_hash_id(normalized)
363
+ for normalized in (
364
+ normalize_http_url(url, base_url=self.base_url)
365
+ for url in dedupe_preserve_order(urls)
366
+ )
367
+ if normalized
368
+ ]
369
+ self._asset_content_cache[comments_hash] = (combined_text, combined_text)
370
+
371
+ asset = SingleAssetScanResults(
372
+ hash=comments_hash,
373
+ checksum=self.calculate_checksum(
374
+ {
375
+ "issue_key": issue_key,
376
+ "comments_count": len(comments),
377
+ "text_length": len(combined_text),
378
+ }
379
+ ),
380
+ name=f"Comments for request {issue_key}",
381
+ external_url=comments_url,
382
+ links=comment_link_hashes,
383
+ asset_type=OutputAssetType.TXT,
384
+ source_id=self.source_id,
385
+ created_at=now,
386
+ updated_at=now,
387
+ runner_id=self.runner_id,
388
+ )
389
+ return asset, [comments_hash], urls
390
+
391
+ def _attachment_assets(
392
+ self,
393
+ issue_key: str,
394
+ request_hash: str,
395
+ now: datetime,
396
+ ) -> tuple[list[SingleAssetScanResults], list[str]]:
397
+ if not self.include_attachments:
398
+ return [], []
399
+
400
+ attachments = self.client.iter_servicedesk_values(
401
+ f"/rest/servicedeskapi/request/{issue_key}/attachment",
402
+ limit=50,
403
+ )
404
+ assets: list[SingleAssetScanResults] = []
405
+ hashes: list[str] = []
406
+ for attachment in attachments:
407
+ if not isinstance(attachment, dict):
408
+ continue
409
+ links = attachment.get("_links")
410
+ content_url = links.get("content") if isinstance(links, dict) else None
411
+ if not isinstance(content_url, str):
412
+ continue
413
+ normalized_url = normalize_http_url(content_url, base_url=self.base_url)
414
+ if not normalized_url:
415
+ continue
416
+
417
+ attachment_hash = self.generate_hash_id(normalized_url)
418
+ self._attachment_url_by_hash[attachment_hash] = normalized_url
419
+ mime = str(attachment.get("mimeType") or "").lower()
420
+ filename = str(attachment.get("filename") or "attachment")
421
+ self._attachment_name_by_hash[attachment_hash] = filename
422
+ metadata = {
423
+ "request_hash": request_hash,
424
+ "mime_type": mime,
425
+ "size": attachment.get("size"),
426
+ "filename": filename,
427
+ }
428
+ assets.append(
429
+ SingleAssetScanResults(
430
+ hash=attachment_hash,
431
+ checksum=self.calculate_checksum(metadata),
432
+ name=filename,
433
+ external_url=normalized_url,
434
+ links=[],
435
+ asset_type=self._asset_type_from_mime_or_name(mime, filename),
436
+ source_id=self.source_id,
437
+ created_at=now,
438
+ updated_at=now,
439
+ runner_id=self.runner_id,
440
+ )
441
+ )
442
+ hashes.append(attachment_hash)
443
+ return assets, hashes
444
+
445
+ def _name_from_obj(self, value: Any) -> str:
446
+ if isinstance(value, dict):
447
+ name = value.get("name")
448
+ if isinstance(name, str):
449
+ return name
450
+ return ""
451
+
452
+ def _status_name(self, request: dict[str, Any]) -> str:
453
+ current_status = request.get("currentStatus")
454
+ if isinstance(current_status, dict):
455
+ status = current_status.get("status")
456
+ if isinstance(status, str):
457
+ return status
458
+ return ""
459
+
460
+ def _parse_date_dto(self, value: Any) -> datetime | None:
461
+ if isinstance(value, str):
462
+ return parse_datetime(value)
463
+ if isinstance(value, dict):
464
+ for key in ("iso8601", "jira", "friendly"):
465
+ candidate = value.get(key)
466
+ if isinstance(candidate, str) and candidate.strip():
467
+ return parse_datetime(candidate)
468
+ return None
469
+
470
def _text_and_urls(self, value: Any) -> tuple[str, list[str]]:
    """Return the text of a field value together with the URLs found in it.

    Strings are returned as-is with URLs extracted from them. Anything else is
    passed to ``parse_atlassian_document``; its URLs are merged with those
    extracted from the resulting text and de-duplicated in order.
    """
    if not isinstance(value, str):
        doc_text, doc_urls = parse_atlassian_document(value)
        merged_urls = doc_urls + extract_urls_from_text(doc_text)
        return doc_text, dedupe_preserve_order(merged_urls)
    return value, extract_urls_from_text(value)
475
+
476
def _asset_type_from_mime_or_name(
    self,
    mime_type: str,
    file_name: str,
) -> OutputAssetType:
    """Classify an attachment by MIME type, using the file name as a fallback.

    The file name is only consulted when the MIME type maps to ``BINARY``; a
    tabular file name then upgrades the result to ``TABLE``.
    """
    detected = self._asset_type_from_mime(mime_type)
    if detected == OutputAssetType.BINARY and is_tabular_filename(file_name):
        return OutputAssetType.TABLE
    return detected
487
+
488
def _asset_type_from_mime(self, mime_type: str) -> OutputAssetType:
    """Map a MIME type (case-insensitive) to an output asset type.

    Checks run in this order: image/video/audio prefixes, tabular MIME types,
    a fixed set of text-like types, ``text/html`` (reported as ``URL``), and
    finally ``BINARY`` for everything else.
    """
    lowered = mime_type.lower()

    media_prefixes = (
        ("image/", OutputAssetType.IMAGE),
        ("video/", OutputAssetType.VIDEO),
        ("audio/", OutputAssetType.AUDIO),
    )
    for prefix, media_type in media_prefixes:
        if lowered.startswith(prefix):
            return media_type

    if is_tabular_mime_type(lowered):
        return OutputAssetType.TABLE

    text_like = ("text/plain", "application/json", "application/xml", "text/xml")
    if lowered in text_like:
        return OutputAssetType.TXT

    return OutputAssetType.URL if lowered == "text/html" else OutputAssetType.BINARY
508
+
509
+ def _add_asset_if_new(
510
+ self,
511
+ assets: list[SingleAssetScanResults],
512
+ asset: SingleAssetScanResults,
513
+ ) -> bool:
514
+ if asset.hash in self._seen_asset_hashes:
515
+ return False
516
+ self._seen_asset_hashes.add(asset.hash)
517
+ assets.append(asset)
518
+ return True
519
+
520
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
    """Download the raw bytes behind an asset and resolve their MIME type.

    ``asset_id`` may be a URL or a hash; a URL that normalizes is converted to
    its hash first. The download URL is looked up among recorded attachments,
    then among all hashed URLs.

    Returns:
        ``(bytes, mime_type)``, with the bytes truncated to
        ``attachment_max_bytes`` when that limit is positive; ``None`` when no
        URL is known for the asset or the download raises.
    """
    canonical = normalize_http_url(asset_id, base_url=self.base_url)
    if canonical:
        asset_id = self.generate_hash_id(canonical)

    # NOTE(review): the fallback table holds every URL hashed during the run,
    # not only attachments - confirm fetching those through the authenticated
    # client is intended.
    download_url = self._attachment_url_by_hash.get(asset_id)
    if not download_url:
        download_url = self._hash_to_url.get(asset_id)
    if not download_url:
        return None

    try:
        payload, declared_mime = self.client.get_bytes(download_url)
    except Exception as exc:
        logger.warning(
            "Failed to fetch Service Desk attachment bytes for %s: %s",
            download_url,
            exc,
        )
        return None

    byte_cap = self.attachment_max_bytes
    if 0 < byte_cap < len(payload):
        payload = payload[:byte_cap]

    # `_attachment_file_name` is not defined in the visible part of this class;
    # presumably it is provided elsewhere (e.g. the base class) - verify.
    resolved_mime = resolve_mime_type(
        payload,
        declared_mime_type=declared_mime,
        file_name=self._attachment_file_name(asset_id, download_url),
    )
    return payload, resolved_mime
551
+
552
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset, using the cache first.

    The cache is checked under ``asset_id`` and, when ``asset_id`` is a URL
    that normalizes, under that URL's hash. On a miss the asset is downloaded,
    truncated to ``attachment_max_bytes`` when that limit is positive, and
    parsed; a result with text content is cached and returned.

    Returns:
        The content pair, or ``None`` when no URL is known for the asset, the
        download raises, or parsing yields no text.
    """
    hit = self._asset_content_cache.get(asset_id)
    if hit:
        return hit

    canonical = normalize_http_url(asset_id, base_url=self.base_url)
    if canonical:
        asset_id = self.generate_hash_id(canonical)
        hit = self._asset_content_cache.get(asset_id)
        if hit:
            return hit

    download_url = self._attachment_url_by_hash.get(asset_id)
    if not download_url:
        download_url = self._hash_to_url.get(asset_id)
    if not download_url:
        return None

    try:
        payload, declared_mime = self.client.get_bytes(download_url)
    except Exception as exc:
        logger.warning("Failed to fetch Service Desk attachment %s: %s", download_url, exc)
        return None

    byte_cap = self.attachment_max_bytes
    if 0 < byte_cap < len(payload):
        payload = payload[:byte_cap]

    # `_attachment_file_name` is not defined in the visible part of this class;
    # presumably it is provided elsewhere (e.g. the base class) - verify.
    parsed = self.parse_asset_bytes(
        payload,
        declared_mime_type=declared_mime,
        file_name=self._attachment_file_name(asset_id, download_url),
    )
    if not parsed.text_content:
        return None

    self._asset_content_cache[asset_id] = (parsed.raw_content, parsed.text_content)
    return parsed.raw_content, parsed.text_content
590
+
591
def generate_hash_id(self, asset_id: str) -> str:
    """Hash a URL and remember which normalized URL the hash stands for.

    Raises:
        ValueError: If ``asset_id`` does not normalize to a URL.
    """
    canonical = normalize_http_url(asset_id, base_url=self.base_url)
    if canonical:
        digest = hash_url(canonical, base_url=self.base_url)
        # Recorded so links and content fetches can map the hash back to a URL.
        self._hash_to_url[digest] = canonical
        return digest
    raise ValueError(f"Invalid URL for hash: {asset_id}")
598
+
599
def resolve_link_for_detection(self, link: str) -> str | None:
    """Turn a link hash back into its URL, or normalize ``link`` if it is not a known hash."""
    return self._hash_to_url.get(link) or normalize_http_url(link)
604
+
605
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set the finding's location to the asset's external URL.

    ``text_content`` is accepted but not used.
    """
    del text_content  # intentionally unused
    location = Location(path=asset.external_url)
    finding.location = location
613
+
614
def abort(self) -> None:
    """Log the abort, run the base-class abort, then close the HTTP client."""
    logger.info("Aborting Service Desk extraction...")
    super().abort()
    http_client = self.client
    http_client.close()
618
+
619
def cleanup(self) -> None:
    """Close the HTTP client held by this source."""
    http_client = self.client
    http_client.close()
@@ -0,0 +1,3 @@
1
+ from .source import SlackSource
2
+
3
+ __all__ = ["SlackSource"]