classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import AsyncGenerator
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
from urllib.parse import urlsplit
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
from ...models.generated_input import (
|
|
10
|
+
ConfluenceInput,
|
|
11
|
+
ConfluenceOptional,
|
|
12
|
+
ConfluenceOptionalConnection,
|
|
13
|
+
ConfluenceOptionalContent,
|
|
14
|
+
ConfluenceOptionalScope,
|
|
15
|
+
SamplingStrategy,
|
|
16
|
+
)
|
|
17
|
+
from ...models.generated_single_asset_scan_results import (
|
|
18
|
+
AssetType as OutputAssetType,
|
|
19
|
+
)
|
|
20
|
+
from ...models.generated_single_asset_scan_results import (
|
|
21
|
+
DetectionResult,
|
|
22
|
+
Location,
|
|
23
|
+
SingleAssetScanResults,
|
|
24
|
+
)
|
|
25
|
+
from ...utils.content_extraction import html_to_text
|
|
26
|
+
from ...utils.file_parser import resolve_mime_type
|
|
27
|
+
from ...utils.hashing import hash_url, normalize_http_url
|
|
28
|
+
from ..atlassian_common import (
|
|
29
|
+
AtlassianCloudClient,
|
|
30
|
+
dedupe_preserve_order,
|
|
31
|
+
deterministic_sample,
|
|
32
|
+
extract_urls_from_text,
|
|
33
|
+
is_tabular_mime_type,
|
|
34
|
+
looks_like_file_asset,
|
|
35
|
+
normalize_atlassian_base_url,
|
|
36
|
+
parse_atlassian_document,
|
|
37
|
+
parse_datetime,
|
|
38
|
+
)
|
|
39
|
+
from ..base import BaseSource
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)

# Fallback mapping from a lower-case URL path suffix to the asset type reported
# for it. It is consulted only when the MIME type alone does not decide the
# asset type. Built from ordered pairs so the insertion order stays stable.
FILE_EXTENSION_HINTS: dict[str, OutputAssetType] = dict(
    (
        # Images.
        (".png", OutputAssetType.IMAGE),
        (".jpg", OutputAssetType.IMAGE),
        (".jpeg", OutputAssetType.IMAGE),
        (".gif", OutputAssetType.IMAGE),
        (".webp", OutputAssetType.IMAGE),
        (".svg", OutputAssetType.IMAGE),
        (".bmp", OutputAssetType.IMAGE),
        (".ico", OutputAssetType.IMAGE),
        # Video.
        (".mp4", OutputAssetType.VIDEO),
        (".webm", OutputAssetType.VIDEO),
        (".mov", OutputAssetType.VIDEO),
        (".mkv", OutputAssetType.VIDEO),
        (".avi", OutputAssetType.VIDEO),
        # Audio.
        (".mp3", OutputAssetType.AUDIO),
        (".wav", OutputAssetType.AUDIO),
        (".aac", OutputAssetType.AUDIO),
        (".ogg", OutputAssetType.AUDIO),
        # Office documents; spreadsheets are reported as tables.
        (".pdf", OutputAssetType.BINARY),
        (".doc", OutputAssetType.BINARY),
        (".docx", OutputAssetType.BINARY),
        (".xls", OutputAssetType.TABLE),
        (".xlsx", OutputAssetType.TABLE),
        (".ppt", OutputAssetType.BINARY),
        (".pptx", OutputAssetType.BINARY),
        # Archives.
        (".zip", OutputAssetType.BINARY),
        (".rar", OutputAssetType.BINARY),
        (".7z", OutputAssetType.BINARY),
        (".tar", OutputAssetType.BINARY),
        (".gz", OutputAssetType.BINARY),
        # Structured data and plain text.
        (".parquet", OutputAssetType.TABLE),
        (".json", OutputAssetType.TXT),
        (".xml", OutputAssetType.TXT),
        (".txt", OutputAssetType.TXT),
        (".csv", OutputAssetType.TABLE),
        (".tsv", OutputAssetType.TABLE),
        (".md", OutputAssetType.TXT),
    )
)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class ConfluenceSource(BaseSource):
    """Extract pages, comments, attachments and linked files from Confluence Cloud.

    Pages are discovered space by space through the ``/wiki/api/v2`` endpoints,
    optionally sampled, and turned into ``SingleAssetScanResults``. Page bodies
    and combined comment text are cached in memory while extracting so that
    ``fetch_content`` can return them later without another request.
    """

    source_type = "confluence"

    # Used for ``attachment_max_bytes`` when the recipe leaves it unset (5 MiB).
    _DEFAULT_ATTACHMENT_MAX_BYTES = 5_242_880

    # MIME prefixes that decide the asset type on their own, checked in order.
    _MIME_PREFIX_TYPES = (
        ("image/", OutputAssetType.IMAGE),
        ("video/", OutputAssetType.VIDEO),
        ("audio/", OutputAssetType.AUDIO),
    )

    # Exact MIME types reported as plain text.
    _TEXT_MIME_TYPES = frozenset(
        {
            "text/plain",
            "application/json",
            "application/xml",
            "text/xml",
        }
    )

    def __init__(
        self,
        recipe: dict[str, Any],
        source_id: str | None = None,
        runner_id: str | None = None,
    ):
        """Validate the recipe, build the API client and initialise caches.

        Args:
            recipe: Raw source recipe; validated with ``ConfluenceInput``.
            source_id: Forwarded to ``BaseSource``.
            runner_id: Forwarded to ``BaseSource``; ``"local-run"`` is stored
                on the instance when it is missing or empty.
        """
        super().__init__(recipe, source_id=source_id, runner_id=runner_id)
        self.config = ConfluenceInput.model_validate(recipe)
        self.runner_id = runner_id or "local-run"

        self.base_url = normalize_atlassian_base_url(
            str(self.config.required.base_url),
            strip_wiki=True,
        )
        self.api_base = f"{self.base_url}/wiki/api/v2"

        connection = self._connection_options()
        self.client = AtlassianCloudClient(
            base_url=self.base_url,
            account_email=str(self.config.required.account_email),
            api_token=self.config.masked.api_token,
            request_timeout_seconds=float(connection.request_timeout_seconds or 30),
            max_retries=int(connection.max_retries or 3),
            rate_limit_delay_seconds=float(connection.rate_limit_delay_seconds or 0),
        )

        # Every content toggle is on unless the recipe sets it to ``False``
        # explicitly; an unset (``None``) value therefore means "enabled".
        content_options = self._content_options()
        self.include_footer_comments = content_options.include_footer_comments is not False
        self.include_inline_comments = content_options.include_inline_comments is not False
        self.include_attachments = content_options.include_attachments is not False
        self.include_linked_file_assets = content_options.include_linked_file_assets is not False
        self.attachment_max_bytes = int(
            content_options.attachment_max_bytes or self._DEFAULT_ATTACHMENT_MAX_BYTES
        )

        # Per-run state; ``_reset_runtime_state`` rebinds all of these.
        self._seen_asset_hashes: set[str] = set()
        self._hash_to_url: dict[str, str] = {}
        self._page_content_cache: dict[str, tuple[str, str]] = {}
        self._asset_content_cache: dict[str, tuple[str, str]] = {}
        self._attachment_download_url_by_hash: dict[str, str] = {}
        # Previously created only by ``_reset_runtime_state``, which made any
        # use before ``extract_raw`` raise ``AttributeError``.
        self._attachment_name_by_hash: dict[str, str] = {}

    def _optional(self) -> ConfluenceOptional:
        """Return the recipe's ``optional`` section, or a default instance."""
        return self.config.optional or ConfluenceOptional()

    def _connection_options(self) -> ConfluenceOptionalConnection:
        """Return the ``optional.connection`` section, or a default instance."""
        return self._optional().connection or ConfluenceOptionalConnection()

    def _scope_options(self) -> ConfluenceOptionalScope:
        """Return the ``optional.scope`` section, or a default instance."""
        return self._optional().scope or ConfluenceOptionalScope()

    def _content_options(self) -> ConfluenceOptionalContent:
        """Return the ``optional.content`` section, or a default instance."""
        return self._optional().content or ConfluenceOptionalContent()

    def test_connection(self) -> dict[str, Any]:
        """Probe the spaces endpoint and report the outcome.

        Returns:
            A dict with ``timestamp``, ``source_type``, ``status``
            (``"SUCCESS"`` or ``"FAILURE"``) and ``message``. Exceptions from
            the client are reported in ``message`` instead of being raised.
        """
        result: dict[str, Any] = {
            "timestamp": datetime.now(UTC).isoformat(),
            "source_type": self.recipe.get("type"),
        }
        try:
            self.client.get_json("/wiki/api/v2/spaces", params={"limit": 1})
        except Exception as exc:
            result["status"] = "FAILURE"
            result["message"] = f"Failed to connect to Confluence Cloud API: {exc}"
        else:
            result["status"] = "SUCCESS"
            result["message"] = "Successfully connected to Confluence Cloud API."
        return result

    async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
        """Yield batches of assets for the (sampled) pages of the configured spaces.

        Assets whose hash was already emitted during this run are skipped.
        Batches hold at most ``self.BATCH_SIZE`` assets; a final partial batch
        is yielded at the end. Nothing is yielded when the source was aborted
        before the call.
        """
        if self._aborted:
            return

        self._reset_runtime_state()

        page_refs = self._discover_page_refs()
        sampled_refs = self._sample_page_refs(page_refs)

        pending_batch: list[SingleAssetScanResults] = []
        for page_ref in sampled_refs:
            if self._aborted:
                break

            for asset in self._extract_page_assets(page_ref):
                if not self._add_asset_if_new(pending_batch, asset):
                    continue
                while len(pending_batch) >= self.BATCH_SIZE:
                    to_emit = pending_batch[: self.BATCH_SIZE]
                    pending_batch = pending_batch[self.BATCH_SIZE :]
                    if to_emit:
                        yield to_emit

        if pending_batch:
            yield pending_batch

    def _reset_runtime_state(self) -> None:
        """Rebind every per-run cache and lookup table to a fresh empty container."""
        self._seen_asset_hashes = set()
        self._hash_to_url = {}
        self._page_content_cache = {}
        self._asset_content_cache = {}
        self._attachment_download_url_by_hash = {}
        self._attachment_name_by_hash = {}

    @staticmethod
    def _version_created_at(page: dict[str, Any]) -> Any:
        """Return ``page["version"]["createdAt"]``, or ``None`` when unavailable."""
        version = page.get("version")
        if isinstance(version, dict):
            return version.get("createdAt")
        return None

    def _discover_page_refs(self) -> list[dict[str, Any]]:
        """List lightweight references to every page of the selected spaces.

        Returns:
            One dict per page with ``space_id``, ``space``, ``page_id``,
            ``title``, ``created_at`` and ``version_created_at``. Spaces and
            pages without an id are skipped. Discovery stops early on abort.
        """
        refs: list[dict[str, Any]] = []
        for space in self._fetch_spaces():
            if self._aborted:
                break
            space_id = str(space.get("id") or "")
            if not space_id:
                continue
            # NOTE(review): the listing asks for "body-format": "storage" but
            # only id/title/timestamps are read here and the body is fetched
            # again per page in _extract_page_assets; dropping the parameter
            # would likely shrink responses. Confirm against the API first.
            page_results = self.client.iter_confluence_results(
                f"/wiki/api/v2/spaces/{space_id}/pages",
                params={"limit": 250, "body-format": "storage"},
            )
            for page in page_results:
                page_id = str(page.get("id") or "")
                if not page_id:
                    continue
                refs.append(
                    {
                        "space_id": space_id,
                        "space": space,
                        "page_id": page_id,
                        "title": page.get("title"),
                        "created_at": page.get("createdAt"),
                        "version_created_at": self._version_created_at(page),
                    }
                )
        return refs

    def _fetch_spaces(self) -> list[dict[str, Any]]:
        """Query the spaces endpoint with the recipe's optional space filters.

        List-valued filters (``ids``, ``keys``, ``labels``) are sent as
        comma-separated strings; ``type`` and ``status`` send the enum value.
        """
        params: dict[str, Any] = {"limit": 250}
        spaces_filter = self._scope_options().spaces
        if spaces_filter:
            if spaces_filter.ids:
                params["ids"] = ",".join(str(v) for v in spaces_filter.ids)
            if spaces_filter.keys:
                params["keys"] = ",".join(str(v) for v in spaces_filter.keys)
            if spaces_filter.type:
                params["type"] = str(spaces_filter.type.value)
            if spaces_filter.status:
                params["status"] = str(spaces_filter.status.value)
            if spaces_filter.labels:
                params["labels"] = ",".join(str(v) for v in spaces_filter.labels)
        # NOTE(review): the annotation promises a list; whether
        # iter_confluence_results returns a list or a lazy iterator is not
        # visible here -- confirm.
        return self.client.iter_confluence_results("/wiki/api/v2/spaces", params=params)

    def _sample_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Apply the recipe's sampling strategy to the discovered page refs.

        ``ALL`` returns everything. Otherwise at most ``rows_per_page``
        (default 100) refs are kept: ``RANDOM`` delegates to
        ``deterministic_sample``; any other strategy keeps the refs with the
        latest version/creation timestamp.
        """
        sampling = self.config.sampling
        if sampling.strategy == SamplingStrategy.ALL:
            return refs

        limit = int(sampling.rows_per_page or 100)
        if limit >= len(refs):
            return refs

        if sampling.strategy == SamplingStrategy.RANDOM:
            return deterministic_sample(refs, limit)

        # NOTE(review): sorting relies on parse_datetime returning mutually
        # comparable values even for an empty string -- confirm.
        refs_sorted = sorted(
            refs,
            key=lambda ref: parse_datetime(
                str(ref.get("version_created_at") or ref.get("created_at") or "")
            ),
            reverse=True,
        )
        return refs_sorted[:limit]

    def _extract_page_assets(self, ref: dict[str, Any]) -> list[SingleAssetScanResults]:
        """Fetch one page and build its asset plus all related assets.

        Args:
            ref: A page reference from ``_discover_page_refs``; only
                ``page_id`` is read.

        Returns:
            The page asset first, followed by linked-file, attachment and
            comments assets (each only when enabled and present).
        """
        page_id = str(ref["page_id"])
        page = self.client.get_json(
            f"/wiki/api/v2/pages/{page_id}",
            params={"body-format": "storage"},
        )

        now = datetime.now(UTC)
        title = str(page.get("title") or f"Confluence Page {page_id}")
        page_url = self._page_url(page, page_id)
        page_hash = self.generate_hash_id(page_url)
        body_storage = self._extract_storage_html(page.get("body"))
        body_text = html_to_text(body_storage)
        self._page_content_cache[page_hash] = (body_storage, body_text)

        related_assets: list[SingleAssetScanResults] = []
        related_hashes: list[str] = []

        body_links = self._extract_urls_from_html(body_storage)
        related_hashes.extend(self.generate_hash_id(link) for link in body_links)

        if self.include_linked_file_assets:
            for link in body_links:
                if not looks_like_file_asset(link):
                    continue
                file_asset = self._make_linked_file_asset(link, page_hash, now)
                if file_asset:
                    related_assets.append(file_asset)
                    related_hashes.append(file_asset.hash)

        if self.include_attachments:
            attachment_assets, attachment_hashes = self._extract_attachment_assets(
                page_id, page_hash, now
            )
            related_assets.extend(attachment_assets)
            related_hashes.extend(attachment_hashes)

        comments_asset, comment_hashes = self._extract_comments_asset(page_id, page_url, now)
        if comments_asset is not None:
            related_assets.append(comments_asset)
            related_hashes.extend(comment_hashes)

        # ``links_count`` deliberately counts before de-duplication: it feeds
        # the checksum, so changing it would change every page checksum.
        page_metadata = {
            "page_id": page_id,
            "space_id": page.get("spaceId"),
            "title": title,
            "status": page.get("status"),
            "links_count": len(related_hashes),
        }
        page_asset = SingleAssetScanResults(
            hash=page_hash,
            checksum=self.calculate_checksum(page_metadata),
            name=title,
            external_url=page_url,
            links=dedupe_preserve_order(related_hashes),
            asset_type=OutputAssetType.URL,
            source_id=self.source_id,
            created_at=parse_datetime(str(page.get("createdAt") or "")),
            # ``or ""`` keeps a missing timestamp from being passed on as the
            # literal string "None", matching how ``created_at`` is handled.
            updated_at=parse_datetime(str(self._version_created_at(page) or "")),
            runner_id=self.runner_id,
        )

        return [page_asset, *related_assets]

    def _extract_attachment_assets(
        self,
        page_id: str,
        page_hash: str,
        now: datetime,
    ) -> tuple[list[SingleAssetScanResults], list[str]]:
        """Build one asset per attachment of a page.

        Attachments without a resolvable URL are skipped. The attachment name
        and, when available, its download URL are remembered by hash for later
        content fetches.

        Returns:
            ``(assets, hashes)`` in matching order.
        """
        assets: list[SingleAssetScanResults] = []
        hashes: list[str] = []
        attachments = self.client.iter_confluence_results(
            f"/wiki/api/v2/pages/{page_id}/attachments",
            params={"limit": 250},
        )
        for attachment in attachments:
            attachment_url = self._attachment_url(attachment)
            if not attachment_url:
                continue

            attachment_hash = self.generate_hash_id(attachment_url)
            attachment_name = str(attachment.get("title") or f"Attachment {attachment.get('id')}")
            self._attachment_name_by_hash[attachment_hash] = attachment_name
            mime = str(attachment.get("mediaType") or "").lower()
            asset_type = self._asset_type_from_mime_or_url(mime, attachment_url)
            metadata = {
                "attachment_id": attachment.get("id"),
                "title": attachment_name,
                "media_type": mime,
                "file_size": attachment.get("fileSize"),
                "page_hash": page_hash,
            }

            download_url = self._attachment_download_url(attachment)
            if download_url:
                self._attachment_download_url_by_hash[attachment_hash] = download_url

            assets.append(
                SingleAssetScanResults(
                    hash=attachment_hash,
                    checksum=self.calculate_checksum(metadata),
                    name=attachment_name,
                    external_url=attachment_url,
                    links=[],
                    asset_type=asset_type,
                    source_id=self.source_id,
                    created_at=now,
                    updated_at=now,
                    runner_id=self.runner_id,
                )
            )
            hashes.append(attachment_hash)
        return assets, hashes

    def _extract_comments_asset(
        self,
        page_id: str,
        page_url: str,
        now: datetime,
    ) -> tuple[SingleAssetScanResults | None, list[str]]:
        """Combine a page's footer/inline comments into a single text asset.

        Returns:
            ``(asset, [asset_hash])``, or ``(None, [])`` when no comment kind
            is enabled, there are no comments, or they contain no text.
        """
        comment_items: list[dict[str, Any]] = []
        if self.include_footer_comments:
            comment_items.extend(
                self.client.iter_confluence_results(
                    f"/wiki/api/v2/pages/{page_id}/footer-comments",
                    params={"limit": 250, "body-format": "storage"},
                )
            )
        if self.include_inline_comments:
            comment_items.extend(
                self.client.iter_confluence_results(
                    f"/wiki/api/v2/pages/{page_id}/inline-comments",
                    params={"limit": 250, "body-format": "storage"},
                )
            )

        if not comment_items:
            return None, []

        text_blocks: list[str] = []
        comment_urls: list[str] = []
        for comment in comment_items:
            text, urls = self._comment_text_and_urls(comment)
            if text:
                text_blocks.append(text)
            comment_urls.extend(urls)

        combined_text = "\n\n".join(text_blocks).strip()
        if not combined_text:
            return None, []

        # Keep a distinct, URL-stable comments asset identifier (fragments are
        # stripped in URL normalization).
        comments_url = f"{page_url}?view=comments"
        comments_hash = self.generate_hash_id(comments_url)
        self._asset_content_cache[comments_hash] = (combined_text, combined_text)

        comment_link_hashes = [
            self.generate_hash_id(normalized)
            for normalized in (
                normalize_http_url(url, base_url=self.base_url)
                for url in dedupe_preserve_order(comment_urls)
            )
            if normalized
        ]

        comments_asset = SingleAssetScanResults(
            hash=comments_hash,
            checksum=self.calculate_checksum(
                {
                    "page_id": page_id,
                    "comments_count": len(comment_items),
                    "text_length": len(combined_text),
                }
            ),
            name=f"Comments for page {page_id}",
            external_url=comments_url,
            links=comment_link_hashes,
            asset_type=OutputAssetType.TXT,
            source_id=self.source_id,
            created_at=now,
            updated_at=now,
            runner_id=self.runner_id,
        )
        return comments_asset, [comments_hash]

    def _comment_text_and_urls(self, comment: dict[str, Any]) -> tuple[str, list[str]]:
        """Extract plain text and URLs from one comment payload.

        The ``storage`` and ``view`` HTML representations are tried first;
        the first one producing non-empty text wins. Otherwise the
        ``atlas_doc_format`` value, and finally the body itself, is handed to
        ``parse_atlassian_document``.

        Returns:
            ``(text, urls)``; ``("", [])`` when ``body`` is not a dict.
        """
        body = comment.get("body")
        if not isinstance(body, dict):
            return "", []

        for body_key in ("storage", "view"):
            candidate = body.get(body_key)
            if not isinstance(candidate, dict):
                continue
            value = candidate.get("value")
            if not (isinstance(value, str) and value):
                continue
            text = html_to_text(value)
            if not text:
                continue
            # Parse the markup only once we know this representation is used.
            urls = list(self._extract_urls_from_html(value))
            visible_text = BeautifulSoup(value, "html.parser").get_text(" ")
            return text, urls + extract_urls_from_text(visible_text)

        atlas = body.get("atlas_doc_format")
        if isinstance(atlas, dict):
            return parse_atlassian_document(atlas)
        if isinstance(atlas, str):
            try:
                return parse_atlassian_document(atlas)
            except Exception:
                # Best effort: treat the unparseable value as plain text.
                return atlas, extract_urls_from_text(atlas)

        return parse_atlassian_document(body)

    def _extract_storage_html(self, body: Any) -> str:
        """Return ``body["storage"]["value"]`` when it is a string, else ``""``."""
        if not isinstance(body, dict):
            return ""
        storage = body.get("storage")
        if not isinstance(storage, dict):
            return ""
        value = storage.get("value")
        return value if isinstance(value, str) else ""

    def _extract_urls_from_html(self, html_content: str) -> list[str]:
        """Collect normalized ``href``/``src`` URLs from ``a``, ``img`` and ``source`` tags.

        Returns:
            De-duplicated URLs in document order; ``[]`` for empty input.
        """
        if not html_content:
            return []
        soup = BeautifulSoup(html_content, "html.parser")
        links: list[str] = []
        for tag in soup.find_all(["a", "img", "source"]):
            for key in ("href", "src"):
                raw = tag.get(key)
                if not isinstance(raw, str):
                    continue
                normalized = normalize_http_url(raw, base_url=self.base_url)
                if normalized:
                    links.append(normalized)
        return dedupe_preserve_order(links)

    def _first_normalized_url(self, *candidates: Any) -> str | None:
        """Return the first string candidate that normalizes to a non-empty URL."""
        for candidate in candidates:
            if not isinstance(candidate, str):
                continue
            normalized = normalize_http_url(candidate, base_url=self.base_url)
            if normalized:
                return normalized
        return None

    def _page_url(self, page: dict[str, Any], page_id: str) -> str:
        """Return the page's web UI URL, falling back to ``/wiki/pages/<id>``."""
        links = page.get("_links")
        webui = links.get("webui") if isinstance(links, dict) else None
        if isinstance(webui, str) and webui:
            normalized = normalize_http_url(webui, base_url=self.base_url)
            if normalized:
                return normalized
        return f"{self.base_url}/wiki/pages/{page_id}"

    def _attachment_url(self, attachment: dict[str, Any]) -> str | None:
        """Return the URL identifying an attachment, or ``None``.

        Candidates, in order: ``downloadLink``, ``webuiLink``, then
        ``_links.download`` and ``_links.webui``.
        """
        links = attachment.get("_links")
        nested = (links.get("download"), links.get("webui")) if isinstance(links, dict) else ()
        return self._first_normalized_url(
            attachment.get("downloadLink"),
            attachment.get("webuiLink"),
            *nested,
        )

    def _attachment_download_url(self, attachment: dict[str, Any]) -> str | None:
        """Return the download URL (``downloadLink`` or ``_links.download``), or ``None``."""
        links = attachment.get("_links")
        nested = (links.get("download"),) if isinstance(links, dict) else ()
        return self._first_normalized_url(attachment.get("downloadLink"), *nested)

    def _make_linked_file_asset(
        self,
        url: str,
        page_hash: str,
        now: datetime,
    ) -> SingleAssetScanResults | None:
        """Build an asset for a file linked from a page body.

        Args:
            url: The link target.
            page_hash: Hash of the referencing page; part of the checksum.
            now: Timestamp stored as both ``created_at`` and ``updated_at``.

        Returns:
            The asset, or ``None`` when the URL cannot be normalized.
        """
        normalized = normalize_http_url(url, base_url=self.base_url)
        if not normalized:
            return None
        metadata = {
            "url": normalized,
            "referenced_by": page_hash,
        }
        return SingleAssetScanResults(
            hash=self.generate_hash_id(normalized),
            checksum=self.calculate_checksum(metadata),
            name=self._display_name_from_url(normalized),
            external_url=normalized,
            links=[],
            # No MIME type is known for a bare link; the URL suffix decides.
            asset_type=self._asset_type_from_mime_or_url("", normalized),
            source_id=self.source_id,
            created_at=now,
            updated_at=now,
            runner_id=self.runner_id,
        )

    def _display_name_from_url(self, url: str) -> str:
        """Return the last path segment of ``url``, or its host when the path is empty."""
        parsed = urlsplit(url)
        file_name = parsed.path.rstrip("/").split("/")[-1]
        return file_name or parsed.netloc

    def _asset_type_from_mime_or_url(
        self,
        mime_type: str,
        url: str,
    ) -> OutputAssetType:
        """Classify an asset by MIME type, falling back to the URL path suffix.

        Order of checks: ``image/``/``video/``/``audio/`` prefixes, tabular
        MIME types, plain-text MIME types, ``text/html`` (reported as URL),
        then ``FILE_EXTENSION_HINTS``. Anything else is ``BINARY``.
        """
        normalized_mime = (mime_type or "").lower()
        for prefix, prefix_type in self._MIME_PREFIX_TYPES:
            if normalized_mime.startswith(prefix):
                return prefix_type
        if is_tabular_mime_type(normalized_mime):
            return OutputAssetType.TABLE
        if normalized_mime in self._TEXT_MIME_TYPES:
            return OutputAssetType.TXT
        if normalized_mime == "text/html":
            return OutputAssetType.URL

        lower_path = urlsplit(url).path.lower()
        for extension, asset_type in FILE_EXTENSION_HINTS.items():
            if lower_path.endswith(extension):
                return asset_type
        return OutputAssetType.BINARY

    def _add_asset_if_new(
        self,
        assets: list[SingleAssetScanResults],
        asset: SingleAssetScanResults,
    ) -> bool:
        """Append ``asset`` to ``assets`` unless its hash was already seen this run.

        Returns:
            ``True`` when the asset was appended, ``False`` for a duplicate.
        """
        if asset.hash in self._seen_asset_hashes:
            return False
        self._seen_asset_hashes.add(asset.hash)
        assets.append(asset)
        return True

    def _resolve_download_url(self, asset_hash: str) -> str | None:
        """Look up the URL to download for a hash.

        The recorded attachment download URL is preferred; otherwise the URL
        the hash was generated from is used. Returns ``None`` when neither is
        known.
        """
        return (
            self._attachment_download_url_by_hash.get(asset_hash)
            or self._hash_to_url.get(asset_hash)
            or None
        )

    def _download_capped(self, download_url: str, failure_message: str) -> tuple[bytes, Any] | None:
        """Download ``download_url`` and truncate the payload to the size cap.

        Args:
            download_url: URL passed to the client's ``get_bytes``.
            failure_message: ``%``-style log template taking the URL and the
                exception, used when the download fails.

        Returns:
            ``(bytes, declared_mime)`` or ``None`` when the download raised.
            The cap applies only when ``attachment_max_bytes`` is positive;
            the full payload is still downloaded before truncation.
        """
        # NOTE(review): the fallback URL may be an arbitrary link found in a
        # page body, and it is fetched through the authenticated Atlassian
        # client -- confirm the client does not send credentials to other hosts.
        try:
            file_bytes, declared_mime = self.client.get_bytes(download_url)
        except Exception as exc:
            logger.warning(failure_message, download_url, exc)
            return None

        if 0 < self.attachment_max_bytes < len(file_bytes):
            file_bytes = file_bytes[: self.attachment_max_bytes]
        return file_bytes, declared_mime

    async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
        """Download the raw bytes of an asset.

        Args:
            asset_id: An asset hash, or a URL (which is converted to its hash).

        Returns:
            ``(bytes, mime_type)`` or ``None`` when no download URL is known
            or the download fails.
        """
        normalized = normalize_http_url(asset_id, base_url=self.base_url)
        if normalized:
            asset_id = self.generate_hash_id(normalized)

        download_url = self._resolve_download_url(asset_id)
        if not download_url:
            return None

        downloaded = self._download_capped(
            download_url, "Failed to fetch attachment bytes for %s: %s"
        )
        if downloaded is None:
            return None
        file_bytes, declared_mime = downloaded

        # NOTE(review): _attachment_file_name is not defined in this class as
        # shown; presumably inherited from BaseSource -- confirm.
        mime_type = resolve_mime_type(
            file_bytes,
            declared_mime_type=declared_mime,
            file_name=self._attachment_file_name(asset_id, download_url),
        )
        return file_bytes, mime_type

    async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
        """Return ``(raw_content, text_content)`` for an asset.

        Cached page and comment content is served first, looked up by the
        given id and then by the hash of its normalized URL. Otherwise the
        asset is downloaded and parsed; a parse with non-empty text is cached.

        Returns:
            The content pair, or ``None`` when nothing can be resolved,
            downloaded, or parsed into text.
        """
        direct = self._asset_content_cache.get(asset_id)
        if direct:
            return direct

        if asset_id in self._page_content_cache:
            return self._page_content_cache[asset_id]

        normalized = normalize_http_url(asset_id, base_url=self.base_url)
        if normalized:
            asset_hash = self.generate_hash_id(normalized)
            if asset_hash in self._page_content_cache:
                return self._page_content_cache[asset_hash]
            if asset_hash in self._asset_content_cache:
                return self._asset_content_cache[asset_hash]
            asset_id = asset_hash

        download_url = self._resolve_download_url(asset_id)
        if not download_url:
            return None

        downloaded = self._download_capped(
            download_url, "Failed to fetch attachment content for %s: %s"
        )
        if downloaded is None:
            return None
        file_bytes, declared_mime = downloaded

        # NOTE(review): _attachment_file_name is not defined in this class as
        # shown; presumably inherited from BaseSource -- confirm.
        parsed = self.parse_asset_bytes(
            file_bytes,
            declared_mime_type=declared_mime,
            file_name=self._attachment_file_name(asset_id, download_url),
        )

        if not parsed.text_content:
            return None
        content = (parsed.raw_content, parsed.text_content)
        self._asset_content_cache[asset_id] = content
        return content

    def generate_hash_id(self, asset_id: str) -> str:
        """Hash a URL and remember the hash-to-URL mapping.

        Raises:
            ValueError: If ``asset_id`` cannot be normalized to a URL.
        """
        normalized = normalize_http_url(asset_id, base_url=self.base_url)
        if not normalized:
            raise ValueError(f"Invalid URL for hash: {asset_id}")
        asset_hash = hash_url(normalized, base_url=self.base_url)
        self._hash_to_url[asset_hash] = normalized
        return asset_hash

    def resolve_link_for_detection(self, link: str) -> str | None:
        """Map a link hash back to its URL, or normalize ``link`` itself."""
        mapped = self._hash_to_url.get(link)
        if mapped:
            return mapped
        # NOTE(review): unlike every other call in this class, no base_url is
        # passed here -- confirm whether that is intentional.
        return normalize_http_url(link)

    def enrich_finding_location(
        self,
        finding: DetectionResult,
        asset: SingleAssetScanResults,
        text_content: str,
    ) -> None:
        """Point the finding's location at the asset's external URL.

        ``text_content`` is accepted for interface compatibility and unused.
        """
        _ = text_content
        finding.location = Location(path=asset.external_url)

    def abort(self) -> None:
        """Log the abort, delegate to ``BaseSource.abort`` and close the client."""
        logger.info("Aborting Confluence extraction...")
        super().abort()
        self.client.close()

    def cleanup(self) -> None:
        """Close the API client.

        NOTE(review): ``abort`` closes the client as well; presumably
        ``close`` tolerates being called twice -- confirm.
        """
        self.client.close()
|