classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,733 @@
1
+ import logging
2
+ from collections.abc import AsyncGenerator
3
+ from datetime import UTC, datetime
4
+ from typing import Any
5
+ from urllib.parse import urlsplit
6
+
7
+ from bs4 import BeautifulSoup
8
+
9
+ from ...models.generated_input import (
10
+ ConfluenceInput,
11
+ ConfluenceOptional,
12
+ ConfluenceOptionalConnection,
13
+ ConfluenceOptionalContent,
14
+ ConfluenceOptionalScope,
15
+ SamplingStrategy,
16
+ )
17
+ from ...models.generated_single_asset_scan_results import (
18
+ AssetType as OutputAssetType,
19
+ )
20
+ from ...models.generated_single_asset_scan_results import (
21
+ DetectionResult,
22
+ Location,
23
+ SingleAssetScanResults,
24
+ )
25
+ from ...utils.content_extraction import html_to_text
26
+ from ...utils.file_parser import resolve_mime_type
27
+ from ...utils.hashing import hash_url, normalize_http_url
28
+ from ..atlassian_common import (
29
+ AtlassianCloudClient,
30
+ dedupe_preserve_order,
31
+ deterministic_sample,
32
+ extract_urls_from_text,
33
+ is_tabular_mime_type,
34
+ looks_like_file_asset,
35
+ normalize_atlassian_base_url,
36
+ parse_atlassian_document,
37
+ parse_datetime,
38
+ )
39
+ from ..base import BaseSource
40
+
41
logger = logging.getLogger(__name__)

# Lowercase file extension (leading dot included) -> asset type reported for a
# URL whose path ends with that extension. Built from (asset type, extensions)
# groups; the groups are listed so the resulting dict keeps the same insertion
# order as a flat literal would.
FILE_EXTENSION_HINTS: dict[str, OutputAssetType] = {
    extension: asset_type
    for asset_type, extensions in (
        (
            OutputAssetType.IMAGE,
            (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".ico"),
        ),
        (OutputAssetType.VIDEO, (".mp4", ".webm", ".mov", ".mkv", ".avi")),
        (OutputAssetType.AUDIO, (".mp3", ".wav", ".aac", ".ogg")),
        (OutputAssetType.BINARY, (".pdf", ".doc", ".docx")),
        (OutputAssetType.TABLE, (".xls", ".xlsx")),
        (
            OutputAssetType.BINARY,
            (".ppt", ".pptx", ".zip", ".rar", ".7z", ".tar", ".gz"),
        ),
        (OutputAssetType.TABLE, (".parquet",)),
        (OutputAssetType.TXT, (".json", ".xml", ".txt")),
        (OutputAssetType.TABLE, (".csv", ".tsv")),
        (OutputAssetType.TXT, (".md",)),
    )
    for extension in extensions
}
81
+
82
+
83
class ConfluenceSource(BaseSource):
    """Source that enumerates Confluence Cloud pages and their related assets.

    For every sampled page the source emits one page asset plus, depending on
    the content options, assets for linked files, attachments and a combined
    "comments" asset. Content for those assets is served afterwards through
    ``fetch_content`` / ``fetch_content_bytes`` from per-run caches.

    NOTE(review): ``self.recipe``, ``self.source_id``, ``self._aborted``,
    ``self.BATCH_SIZE``, ``calculate_checksum`` and ``parse_asset_bytes`` are
    not defined in this class; presumably they come from ``BaseSource``.
    NOTE(review): ``_attachment_file_name`` is called by the fetch methods but
    is not defined in this class either — confirm ``BaseSource`` provides it.
    """

    source_type = "confluence"

    def __init__(
        self,
        recipe: dict[str, Any],
        source_id: str | None = None,
        runner_id: str | None = None,
    ):
        """Validate the recipe, build the API client and initialise caches.

        Args:
            recipe: Raw recipe dict; validated into a ``ConfluenceInput``.
            source_id: Forwarded to ``BaseSource``.
            runner_id: Forwarded to ``BaseSource``; falls back to
                ``"local-run"`` on this instance when falsy.
        """
        super().__init__(recipe, source_id=source_id, runner_id=runner_id)
        self.config = ConfluenceInput.model_validate(recipe)
        self.runner_id = runner_id or "local-run"

        self.base_url = normalize_atlassian_base_url(
            str(self.config.required.base_url),
            strip_wiki=True,
        )
        self.api_base = f"{self.base_url}/wiki/api/v2"

        connection = self._connection_options()
        self.client = AtlassianCloudClient(
            base_url=self.base_url,
            account_email=str(self.config.required.account_email),
            api_token=self.config.masked.api_token,
            request_timeout_seconds=float(connection.request_timeout_seconds or 30),
            max_retries=int(connection.max_retries or 3),
            rate_limit_delay_seconds=float(connection.rate_limit_delay_seconds or 0),
        )

        content_options = self._content_options()
        # `is not False` makes every toggle default to enabled when it is unset.
        self.include_footer_comments = content_options.include_footer_comments is not False
        self.include_inline_comments = content_options.include_inline_comments is not False
        self.include_attachments = content_options.include_attachments is not False
        self.include_linked_file_assets = content_options.include_linked_file_assets is not False
        # NOTE(review): `or` maps an explicit 0 to the 5 MiB default, so the
        # `> 0` guard applied when truncating downloads can only be bypassed
        # with a negative value — confirm whether 0 should mean "no limit".
        self.attachment_max_bytes = int(content_options.attachment_max_bytes or 5_242_880)

        self._seen_asset_hashes: set[str] = set()
        self._hash_to_url: dict[str, str] = {}
        self._page_content_cache: dict[str, tuple[str, str]] = {}
        self._asset_content_cache: dict[str, tuple[str, str]] = {}
        self._attachment_download_url_by_hash: dict[str, str] = {}
        # Previously only created by `_reset_runtime_state`, so touching it
        # before `extract_raw` had run raised AttributeError.
        self._attachment_name_by_hash: dict[str, str] = {}

    def _optional(self) -> ConfluenceOptional:
        """Return the recipe's optional section, or an empty default one."""
        return self.config.optional or ConfluenceOptional()

    def _connection_options(self) -> ConfluenceOptionalConnection:
        """Return the configured connection options, or defaults."""
        return self._optional().connection or ConfluenceOptionalConnection()

    def _scope_options(self) -> ConfluenceOptionalScope:
        """Return the configured scope options, or defaults."""
        return self._optional().scope or ConfluenceOptionalScope()

    def _content_options(self) -> ConfluenceOptionalContent:
        """Return the configured content options, or defaults."""
        return self._optional().content or ConfluenceOptionalContent()

    def test_connection(self) -> dict[str, Any]:
        """Probe the spaces endpoint and report the outcome.

        Returns:
            A dict with ``timestamp``, ``source_type``, ``status``
            (``"SUCCESS"`` or ``"FAILURE"``) and a human-readable ``message``.
            Any exception from the request is reported, never raised.
        """
        result: dict[str, Any] = {
            "timestamp": datetime.now(UTC).isoformat(),
            "source_type": self.recipe.get("type"),
        }
        try:
            self.client.get_json("/wiki/api/v2/spaces", params={"limit": 1})
        except Exception as exc:
            result["status"] = "FAILURE"
            result["message"] = f"Failed to connect to Confluence Cloud API: {exc}"
        else:
            result["status"] = "SUCCESS"
            result["message"] = "Successfully connected to Confluence Cloud API."
        return result

    async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
        """Yield de-duplicated assets in batches of at most ``BATCH_SIZE``.

        Runtime caches are reset first, pages are discovered and sampled, and
        each page's assets are appended to a pending batch unless their hash
        was already seen in this run. Stops early once the source is aborted;
        whatever is still pending is yielded as a final, possibly short, batch.
        """
        if self._aborted:
            return

        self._reset_runtime_state()

        page_refs = self._discover_page_refs()
        sampled_refs = self._sample_page_refs(page_refs)

        pending_batch: list[SingleAssetScanResults] = []
        for page_ref in sampled_refs:
            if self._aborted:
                break

            for asset in self._extract_page_assets(page_ref):
                if not self._add_asset_if_new(pending_batch, asset):
                    continue
                while len(pending_batch) >= self.BATCH_SIZE:
                    to_emit = pending_batch[: self.BATCH_SIZE]
                    pending_batch = pending_batch[self.BATCH_SIZE :]
                    if to_emit:
                        yield to_emit

        if pending_batch:
            yield pending_batch

    def _reset_runtime_state(self) -> None:
        """Drop every per-run cache so a new extraction starts clean."""
        self._seen_asset_hashes = set()
        self._hash_to_url = {}
        self._page_content_cache = {}
        self._asset_content_cache = {}
        self._attachment_download_url_by_hash = {}
        self._attachment_name_by_hash = {}

    def _discover_page_refs(self) -> list[dict[str, Any]]:
        """List lightweight references for the pages of every selected space.

        Returns:
            One dict per page with ``space_id``, ``space``, ``page_id``,
            ``title``, ``created_at`` and ``version_created_at``. Spaces and
            pages without an id are skipped.
        """
        refs: list[dict[str, Any]] = []
        for space in self._fetch_spaces():
            if self._aborted:
                break
            space_id = str(space.get("id") or "")
            if not space_id:
                continue
            page_results = self.client.iter_confluence_results(
                f"/wiki/api/v2/spaces/{space_id}/pages",
                params={"limit": 250, "body-format": "storage"},
            )
            for page in page_results:
                page_id = str(page.get("id") or "")
                if not page_id:
                    continue
                version = page.get("version")
                refs.append(
                    {
                        "space_id": space_id,
                        "space": space,
                        "page_id": page_id,
                        "title": page.get("title"),
                        "created_at": page.get("createdAt"),
                        "version_created_at": (
                            version.get("createdAt") if isinstance(version, dict) else None
                        ),
                    }
                )
        return refs

    def _fetch_spaces(self) -> list[dict[str, Any]]:
        """Query the spaces endpoint, applying any configured space filters."""
        params: dict[str, Any] = {"limit": 250}
        spaces_filter = self._scope_options().spaces
        if spaces_filter:
            if spaces_filter.ids:
                params["ids"] = ",".join(str(v) for v in spaces_filter.ids)
            if spaces_filter.keys:
                params["keys"] = ",".join(str(v) for v in spaces_filter.keys)
            if spaces_filter.type:
                params["type"] = str(spaces_filter.type.value)
            if spaces_filter.status:
                params["status"] = str(spaces_filter.status.value)
            if spaces_filter.labels:
                params["labels"] = ",".join(str(v) for v in spaces_filter.labels)
        return self.client.iter_confluence_results("/wiki/api/v2/spaces", params=params)

    def _sample_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Reduce ``refs`` according to the recipe's sampling settings.

        ``ALL`` returns everything. Otherwise at most ``rows_per_page``
        (default 100) refs are kept: ``RANDOM`` delegates to
        ``deterministic_sample``; any other strategy keeps the refs with the
        newest version/creation timestamp.
        """
        sampling = self.config.sampling
        if sampling.strategy == SamplingStrategy.ALL:
            return refs

        limit = int(sampling.rows_per_page or 100)
        if limit >= len(refs):
            return refs

        if sampling.strategy == SamplingStrategy.RANDOM:
            return deterministic_sample(refs, limit)

        refs_sorted = sorted(
            refs,
            key=lambda ref: parse_datetime(
                str(ref.get("version_created_at") or ref.get("created_at") or "")
            ),
            reverse=True,
        )
        return refs_sorted[:limit]

    def _extract_page_assets(self, ref: dict[str, Any]) -> list[SingleAssetScanResults]:
        """Fetch one page and build its asset plus all related assets.

        Args:
            ref: A page reference as produced by ``_discover_page_refs``;
                only ``page_id`` is read.

        Returns:
            The page asset first, followed by linked-file, attachment and
            comments assets. The page asset's ``links`` holds the de-duplicated
            hashes of body links and of every related asset.
        """
        page_id = str(ref["page_id"])
        page = self.client.get_json(
            f"/wiki/api/v2/pages/{page_id}",
            params={"body-format": "storage"},
        )

        now = datetime.now(UTC)
        title = str(page.get("title") or f"Confluence Page {page_id}")
        page_url = self._page_url(page, page_id)
        page_hash = self.generate_hash_id(page_url)
        body_storage = self._extract_storage_html(page.get("body"))
        body_text = html_to_text(body_storage)
        # Cached so fetch_content can serve the page body without a new request.
        self._page_content_cache[page_hash] = (body_storage, body_text)

        related_assets: list[SingleAssetScanResults] = []
        related_hashes: list[str] = []

        body_links = self._extract_urls_from_html(body_storage)
        related_hashes.extend(self.generate_hash_id(link) for link in body_links)

        if self.include_linked_file_assets:
            for link in body_links:
                if not looks_like_file_asset(link):
                    continue
                file_asset = self._make_linked_file_asset(link, page_hash, now)
                if file_asset:
                    related_assets.append(file_asset)
                    related_hashes.append(file_asset.hash)

        if self.include_attachments:
            attachment_assets, attachment_hashes = self._extract_attachment_assets(
                page_id, page_hash, now
            )
            related_assets.extend(attachment_assets)
            related_hashes.extend(attachment_hashes)

        comments_asset, comment_hashes = self._extract_comments_asset(page_id, page_url, now)
        if comments_asset is not None:
            related_assets.append(comments_asset)
            related_hashes.extend(comment_hashes)

        page_metadata = {
            "page_id": page_id,
            "space_id": page.get("spaceId"),
            "title": title,
            "status": page.get("status"),
            "links_count": len(related_hashes),
        }

        # A version dict without "createdAt" used to be stringified to the
        # literal "None"; pass an empty string instead, like created_at does.
        version = page.get("version")
        version_created_at = version.get("createdAt") if isinstance(version, dict) else None

        page_asset = SingleAssetScanResults(
            hash=page_hash,
            checksum=self.calculate_checksum(page_metadata),
            name=title,
            external_url=page_url,
            links=dedupe_preserve_order(related_hashes),
            asset_type=OutputAssetType.URL,
            source_id=self.source_id,
            created_at=parse_datetime(str(page.get("createdAt") or "")),
            updated_at=parse_datetime(str(version_created_at or "")),
            runner_id=self.runner_id,
        )

        return [page_asset, *related_assets]

    def _extract_attachment_assets(
        self,
        page_id: str,
        page_hash: str,
        now: datetime,
    ) -> tuple[list[SingleAssetScanResults], list[str]]:
        """Build one asset per attachment of ``page_id``.

        Attachments without a usable URL are skipped. The attachment's display
        name and, when present, its download URL are recorded by hash.

        Returns:
            ``(assets, hashes)`` in matching order.
        """
        assets: list[SingleAssetScanResults] = []
        hashes: list[str] = []
        attachments = self.client.iter_confluence_results(
            f"/wiki/api/v2/pages/{page_id}/attachments",
            params={"limit": 250},
        )
        for attachment in attachments:
            attachment_url = self._attachment_url(attachment)
            if not attachment_url:
                continue

            attachment_hash = self.generate_hash_id(attachment_url)
            attachment_name = str(attachment.get("title") or f"Attachment {attachment.get('id')}")
            self._attachment_name_by_hash[attachment_hash] = attachment_name
            mime = str(attachment.get("mediaType") or "").lower()
            asset_type = self._asset_type_from_mime_or_url(mime, attachment_url)
            metadata = {
                "attachment_id": attachment.get("id"),
                "title": attachment_name,
                "media_type": mime,
                "file_size": attachment.get("fileSize"),
                "page_hash": page_hash,
            }

            download_url = self._attachment_download_url(attachment)
            if download_url:
                self._attachment_download_url_by_hash[attachment_hash] = download_url

            assets.append(
                SingleAssetScanResults(
                    hash=attachment_hash,
                    checksum=self.calculate_checksum(metadata),
                    name=attachment_name,
                    external_url=attachment_url,
                    links=[],
                    asset_type=asset_type,
                    source_id=self.source_id,
                    created_at=now,
                    updated_at=now,
                    runner_id=self.runner_id,
                )
            )
            hashes.append(attachment_hash)
        return assets, hashes

    def _extract_comments_asset(
        self,
        page_id: str,
        page_url: str,
        now: datetime,
    ) -> tuple[SingleAssetScanResults | None, list[str]]:
        """Merge a page's footer/inline comments into a single text asset.

        Returns:
            ``(asset, [asset_hash])``, or ``(None, [])`` when both comment
            kinds are disabled, there are no comments, or no comment yields
            any text.
        """
        comment_items: list[dict[str, Any]] = []
        if self.include_footer_comments:
            comment_items.extend(
                self.client.iter_confluence_results(
                    f"/wiki/api/v2/pages/{page_id}/footer-comments",
                    params={"limit": 250, "body-format": "storage"},
                )
            )
        if self.include_inline_comments:
            comment_items.extend(
                self.client.iter_confluence_results(
                    f"/wiki/api/v2/pages/{page_id}/inline-comments",
                    params={"limit": 250, "body-format": "storage"},
                )
            )

        if not comment_items:
            return None, []

        text_blocks: list[str] = []
        comment_urls: list[str] = []
        for comment in comment_items:
            text, urls = self._comment_text_and_urls(comment)
            if text:
                text_blocks.append(text)
            comment_urls.extend(urls)

        combined_text = "\n\n".join(text_blocks).strip()
        if not combined_text:
            return None, []

        # Keep a distinct, URL-stable comments asset identifier (fragments are
        # stripped in URL normalization). Join with "&" when the page URL
        # already carries a query string, otherwise a second "?" would produce
        # a malformed URL.
        separator = "&" if urlsplit(page_url).query else "?"
        comments_url = f"{page_url}{separator}view=comments"
        comments_hash = self.generate_hash_id(comments_url)
        self._asset_content_cache[comments_hash] = (combined_text, combined_text)

        comment_link_hashes: list[str] = []
        for url in dedupe_preserve_order(comment_urls):
            normalized = normalize_http_url(url, base_url=self.base_url)
            if normalized:
                comment_link_hashes.append(self.generate_hash_id(normalized))

        comments_asset = SingleAssetScanResults(
            hash=comments_hash,
            checksum=self.calculate_checksum(
                {
                    "page_id": page_id,
                    "comments_count": len(comment_items),
                    "text_length": len(combined_text),
                }
            ),
            name=f"Comments for page {page_id}",
            external_url=comments_url,
            links=comment_link_hashes,
            asset_type=OutputAssetType.TXT,
            source_id=self.source_id,
            created_at=now,
            updated_at=now,
            runner_id=self.runner_id,
        )
        return comments_asset, [comments_hash]

    def _comment_text_and_urls(self, comment: dict[str, Any]) -> tuple[str, list[str]]:
        """Extract plain text and URLs from a single comment payload.

        The ``storage`` then ``view`` HTML representations are tried first;
        the first one that yields text wins. Otherwise ``atlas_doc_format``
        (dict or str) is handed to ``parse_atlassian_document``, and as a last
        resort the whole body dict is.

        Returns:
            ``(text, urls)``; ``("", [])`` when the comment has no dict body.
        """
        body = comment.get("body")
        if not isinstance(body, dict):
            return "", []

        for body_key in ("storage", "view"):
            candidate = body.get(body_key)
            if not isinstance(candidate, dict):
                continue
            value = candidate.get("value")
            if not (isinstance(value, str) and value):
                continue
            urls = list(self._extract_urls_from_html(value))
            text = html_to_text(value)
            if text:
                # Also pick up bare URLs that appear only in the visible text.
                visible_text = BeautifulSoup(value, "html.parser").get_text(" ")
                return text, urls + extract_urls_from_text(visible_text)

        atlas = body.get("atlas_doc_format")
        if isinstance(atlas, dict):
            return parse_atlassian_document(atlas)
        if isinstance(atlas, str):
            try:
                return parse_atlassian_document(atlas)
            except Exception:
                # Best effort: treat the raw string itself as the comment text.
                return atlas, extract_urls_from_text(atlas)

        return parse_atlassian_document(body)

    def _extract_storage_html(self, body: Any) -> str:
        """Return ``body["storage"]["value"]`` when it is a string, else ``""``."""
        if not isinstance(body, dict):
            return ""
        storage = body.get("storage")
        if not isinstance(storage, dict):
            return ""
        value = storage.get("value")
        return value if isinstance(value, str) else ""

    def _extract_urls_from_html(self, html_content: str) -> list[str]:
        """Collect normalized ``href``/``src`` URLs from a/img/source tags.

        Returns:
            De-duplicated URLs in document order; empty for empty input.
        """
        if not html_content:
            return []
        soup = BeautifulSoup(html_content, "html.parser")
        links: list[str] = []
        for tag in soup.find_all(["a", "img", "source"]):
            for key in ("href", "src"):
                raw = tag.get(key)
                if not isinstance(raw, str):
                    continue
                normalized = normalize_http_url(raw, base_url=self.base_url)
                if normalized:
                    links.append(normalized)
        return dedupe_preserve_order(links)

    def _page_url(self, page: dict[str, Any], page_id: str) -> str:
        """Return the page's normalized ``_links.webui`` URL, or a fallback
        ``{base_url}/wiki/pages/{page_id}`` URL when that is unavailable."""
        links = page.get("_links")
        if isinstance(links, dict):
            webui = links.get("webui")
            if isinstance(webui, str) and webui:
                normalized = normalize_http_url(webui, base_url=self.base_url)
                if normalized:
                    return normalized
        return f"{self.base_url}/wiki/pages/{page_id}"

    def _first_normalized_link(
        self,
        attachment: dict[str, Any],
        top_level_keys: tuple[str, ...],
        nested_keys: tuple[str, ...],
    ) -> str | None:
        """Return the first link that normalizes to a URL, or ``None``.

        ``top_level_keys`` are looked up on ``attachment`` itself first, then
        ``nested_keys`` inside ``attachment["_links"]`` when that is a dict.
        """
        links = attachment.get("_links")
        candidates = [attachment.get(key) for key in top_level_keys]
        if isinstance(links, dict):
            candidates.extend(links.get(key) for key in nested_keys)
        for value in candidates:
            if not isinstance(value, str):
                continue
            normalized = normalize_http_url(value, base_url=self.base_url)
            if normalized:
                return normalized
        return None

    def _attachment_url(self, attachment: dict[str, Any]) -> str | None:
        """Return the attachment's identifying URL (download link preferred,
        web UI link otherwise), or ``None`` when none normalizes."""
        return self._first_normalized_link(
            attachment,
            ("downloadLink", "webuiLink"),
            ("download", "webui"),
        )

    def _attachment_download_url(self, attachment: dict[str, Any]) -> str | None:
        """Return the attachment's normalized download URL, or ``None``."""
        return self._first_normalized_link(attachment, ("downloadLink",), ("download",))

    def _make_linked_file_asset(
        self,
        url: str,
        page_hash: str,
        now: datetime,
    ) -> SingleAssetScanResults | None:
        """Build an asset for a file URL referenced from a page body.

        Returns:
            The asset, or ``None`` when ``url`` does not normalize.
        """
        normalized = normalize_http_url(url, base_url=self.base_url)
        if not normalized:
            return None
        linked_hash = self.generate_hash_id(normalized)
        # No MIME type is known for a bare link; the type comes from the URL path.
        asset_type = self._asset_type_from_mime_or_url("", normalized)
        metadata = {
            "url": normalized,
            "referenced_by": page_hash,
        }
        return SingleAssetScanResults(
            hash=linked_hash,
            checksum=self.calculate_checksum(metadata),
            name=self._display_name_from_url(normalized),
            external_url=normalized,
            links=[],
            asset_type=asset_type,
            source_id=self.source_id,
            created_at=now,
            updated_at=now,
            runner_id=self.runner_id,
        )

    def _display_name_from_url(self, url: str) -> str:
        """Return the last path segment of ``url``, or its host when the path
        has no segments."""
        parsed = urlsplit(url)
        file_name = parsed.path.rstrip("/").split("/")[-1]
        return file_name or parsed.netloc

    def _asset_type_from_mime_or_url(
        self,
        mime_type: str,
        url: str,
    ) -> OutputAssetType:
        """Classify an asset from its MIME type, falling back to the URL path.

        The MIME type is checked first (image/video/audio prefixes, tabular
        types, a few text types, HTML). When it decides nothing, the URL
        path's extension is looked up in ``FILE_EXTENSION_HINTS``; unknown
        extensions are reported as ``BINARY``.
        """
        normalized_mime = (mime_type or "").lower()
        if normalized_mime.startswith("image/"):
            return OutputAssetType.IMAGE
        if normalized_mime.startswith("video/"):
            return OutputAssetType.VIDEO
        if normalized_mime.startswith("audio/"):
            return OutputAssetType.AUDIO
        if is_tabular_mime_type(normalized_mime):
            return OutputAssetType.TABLE
        if normalized_mime in {
            "text/plain",
            "application/json",
            "application/xml",
            "text/xml",
        }:
            return OutputAssetType.TXT
        if normalized_mime == "text/html":
            return OutputAssetType.URL

        lower_path = urlsplit(url).path.lower()
        for extension, asset_type in FILE_EXTENSION_HINTS.items():
            if lower_path.endswith(extension):
                return asset_type
        return OutputAssetType.BINARY

    def _add_asset_if_new(
        self,
        assets: list[SingleAssetScanResults],
        asset: SingleAssetScanResults,
    ) -> bool:
        """Append ``asset`` to ``assets`` unless its hash was already seen.

        Returns:
            ``True`` when the asset was appended, ``False`` for a duplicate.
        """
        if asset.hash in self._seen_asset_hashes:
            return False
        self._seen_asset_hashes.add(asset.hash)
        assets.append(asset)
        return True

    def _lookup_download_url(self, asset_hash: str) -> str | None:
        """Return the URL to download for ``asset_hash``: the recorded
        attachment download URL if any, else the URL the hash was made from."""
        return (
            self._attachment_download_url_by_hash.get(asset_hash)
            or self._hash_to_url.get(asset_hash)
            or None
        )

    def _cap_attachment_bytes(self, payload: bytes) -> bytes:
        """Truncate ``payload`` to ``attachment_max_bytes`` when that limit is
        positive and exceeded; otherwise return it unchanged."""
        limit = self.attachment_max_bytes
        if limit > 0 and len(payload) > limit:
            return payload[:limit]
        return payload

    async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
        """Download an asset's raw bytes.

        Args:
            asset_id: An asset hash, or a URL (which is converted to its hash).

        Returns:
            ``(bytes, mime_type)`` with the bytes capped by
            ``_cap_attachment_bytes``, or ``None`` when no URL is known for the
            asset or the download fails (the failure is logged).
        """
        normalized = normalize_http_url(asset_id, base_url=self.base_url)
        if normalized:
            asset_id = self.generate_hash_id(normalized)

        download_url = self._lookup_download_url(asset_id)
        if not download_url:
            return None

        try:
            file_bytes, declared_mime = self.client.get_bytes(download_url)
        except Exception as exc:
            logger.warning("Failed to fetch attachment bytes for %s: %s", download_url, exc)
            return None

        file_bytes = self._cap_attachment_bytes(file_bytes)

        mime_type = resolve_mime_type(
            file_bytes,
            declared_mime_type=declared_mime,
            file_name=self._attachment_file_name(asset_id, download_url),
        )
        return file_bytes, mime_type

    async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
        """Return ``(raw_content, text_content)`` for an asset.

        Cached comment/attachment and page content is served first, keyed by
        ``asset_id`` directly or by the hash of its normalized URL. Otherwise
        the asset is downloaded, capped by ``_cap_attachment_bytes``, parsed
        with ``parse_asset_bytes`` and cached when it yields text.

        Returns:
            The content pair, or ``None`` when no URL is known, the download
            fails (logged) or parsing yields no text.
        """
        direct = self._asset_content_cache.get(asset_id)
        if direct:
            return direct

        if asset_id in self._page_content_cache:
            return self._page_content_cache[asset_id]

        normalized = normalize_http_url(asset_id, base_url=self.base_url)
        if normalized:
            asset_hash = self.generate_hash_id(normalized)
            if asset_hash in self._page_content_cache:
                return self._page_content_cache[asset_hash]
            if asset_hash in self._asset_content_cache:
                return self._asset_content_cache[asset_hash]
            asset_id = asset_hash

        download_url = self._lookup_download_url(asset_id)
        if not download_url:
            return None

        try:
            file_bytes, declared_mime = self.client.get_bytes(download_url)
        except Exception as exc:
            logger.warning("Failed to fetch attachment content for %s: %s", download_url, exc)
            return None

        file_bytes = self._cap_attachment_bytes(file_bytes)

        parsed = self.parse_asset_bytes(
            file_bytes,
            declared_mime_type=declared_mime,
            file_name=self._attachment_file_name(asset_id, download_url),
        )

        if parsed.text_content:
            self._asset_content_cache[asset_id] = (parsed.raw_content, parsed.text_content)
            return parsed.raw_content, parsed.text_content
        return None

    def generate_hash_id(self, asset_id: str) -> str:
        """Hash a URL and remember the hash -> normalized URL mapping.

        Raises:
            ValueError: If ``asset_id`` does not normalize to a URL.
        """
        normalized = normalize_http_url(asset_id, base_url=self.base_url)
        if not normalized:
            raise ValueError(f"Invalid URL for hash: {asset_id}")
        asset_hash = hash_url(normalized, base_url=self.base_url)
        self._hash_to_url[asset_hash] = normalized
        return asset_hash

    def resolve_link_for_detection(self, link: str) -> str | None:
        """Map a known asset hash back to its URL; otherwise return ``link``
        normalized as a URL (without the base URL), which may be ``None``."""
        mapped = self._hash_to_url.get(link)
        if mapped:
            return mapped
        return normalize_http_url(link)

    def enrich_finding_location(
        self,
        finding: DetectionResult,
        asset: SingleAssetScanResults,
        text_content: str,
    ) -> None:
        """Set the finding's location to the asset's external URL.

        ``text_content`` is accepted but not used.
        """
        _ = text_content
        finding.location = Location(path=asset.external_url)

    def abort(self) -> None:
        """Log the abort, delegate to ``BaseSource.abort`` and close the client."""
        logger.info("Aborting Confluence extraction...")
        super().abort()
        self.client.close()

    def cleanup(self) -> None:
        """Close the API client."""
        self.client.close()