classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,605 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import AsyncGenerator
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ...models.generated_input import (
|
|
7
|
+
JiraInput,
|
|
8
|
+
JiraOptional,
|
|
9
|
+
JiraOptionalConnection,
|
|
10
|
+
JiraOptionalContent,
|
|
11
|
+
SamplingStrategy,
|
|
12
|
+
)
|
|
13
|
+
from ...models.generated_single_asset_scan_results import (
|
|
14
|
+
AssetType as OutputAssetType,
|
|
15
|
+
)
|
|
16
|
+
from ...models.generated_single_asset_scan_results import (
|
|
17
|
+
DetectionResult,
|
|
18
|
+
Location,
|
|
19
|
+
SingleAssetScanResults,
|
|
20
|
+
)
|
|
21
|
+
from ...utils.file_parser import resolve_mime_type
|
|
22
|
+
from ...utils.hashing import hash_url, normalize_http_url
|
|
23
|
+
from ..atlassian_common import (
|
|
24
|
+
AtlassianCloudClient,
|
|
25
|
+
dedupe_preserve_order,
|
|
26
|
+
deterministic_sample,
|
|
27
|
+
extract_urls_from_text,
|
|
28
|
+
is_tabular_filename,
|
|
29
|
+
is_tabular_mime_type,
|
|
30
|
+
json_dumps,
|
|
31
|
+
normalize_atlassian_base_url,
|
|
32
|
+
parse_atlassian_document,
|
|
33
|
+
parse_datetime,
|
|
34
|
+
)
|
|
35
|
+
from ..base import BaseSource
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class JiraSource(BaseSource):
|
|
41
|
+
source_type = "jira"
|
|
42
|
+
|
|
43
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
):
    """Validate the recipe and build the Jira Cloud client and run-time caches.

    Args:
        recipe: Raw source recipe; validated into ``JiraInput``.
        source_id: Forwarded to ``BaseSource``.
        runner_id: Forwarded to ``BaseSource``; ``"local-run"`` is stored on
            this instance when it is missing or empty.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)
    self.config = JiraInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    self.base_url = normalize_atlassian_base_url(str(self.config.required.base_url))
    connection = self._connection_options()
    # Falsy connection values (None or 0) fall back to the literals below.
    self.client = AtlassianCloudClient(
        base_url=self.base_url,
        account_email=str(self.config.required.account_email),
        api_token=self.config.masked.api_token,
        request_timeout_seconds=float(connection.request_timeout_seconds or 30),
        max_retries=int(connection.max_retries or 3),
        rate_limit_delay_seconds=float(connection.rate_limit_delay_seconds or 0),
    )

    content_options = self._content_options()
    # Comments and attachments are included unless explicitly set to False.
    self.include_comments = content_options.include_comments is not False
    self.include_attachments = content_options.include_attachments is not False
    self.attachment_max_bytes = int(content_options.attachment_max_bytes or 5_242_880)

    # Per-run bookkeeping; _reset_runtime_state() re-creates all of these.
    self._seen_asset_hashes: set[str] = set()
    self._hash_to_url: dict[str, str] = {}
    self._asset_content_cache: dict[str, tuple[str, str]] = {}
    self._attachment_url_by_hash: dict[str, str] = {}
    # Written by _attachment_assets(); initialised here so the attribute
    # exists even when that method runs before any reset.
    self._attachment_name_by_hash: dict[str, str] = {}
|
|
73
|
+
|
|
74
|
+
def _optional(self) -> JiraOptional:
|
|
75
|
+
if self.config.optional:
|
|
76
|
+
return self.config.optional
|
|
77
|
+
return JiraOptional()
|
|
78
|
+
|
|
79
|
+
def _connection_options(self) -> JiraOptionalConnection:
|
|
80
|
+
optional = self._optional()
|
|
81
|
+
if optional.connection:
|
|
82
|
+
return optional.connection
|
|
83
|
+
return JiraOptionalConnection()
|
|
84
|
+
|
|
85
|
+
def _content_options(self) -> JiraOptionalContent:
|
|
86
|
+
optional = self._optional()
|
|
87
|
+
if optional.content:
|
|
88
|
+
return optional.content
|
|
89
|
+
return JiraOptionalContent()
|
|
90
|
+
|
|
91
|
+
def test_connection(self) -> dict[str, Any]:
    """Probe the Jira API with a one-result project search and report the outcome.

    Returns:
        A dict with ``timestamp`` (UTC, ISO-8601), ``source_type`` (the
        recipe's ``type`` entry), ``status`` (``"SUCCESS"`` or ``"FAILURE"``)
        and ``message``. Exceptions raised by the request are not propagated;
        they are reported in ``message``.
    """
    report: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }
    try:
        self.client.get_json("/rest/api/3/project/search", params={"maxResults": 1})
    except Exception as exc:
        report.update(
            status="FAILURE",
            message=f"Failed to connect to Jira Cloud API: {exc}",
        )
    else:
        report.update(
            status="SUCCESS",
            message="Successfully connected to Jira Cloud API.",
        )
    return report
|
|
104
|
+
|
|
105
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Search Jira and yield the resulting assets in batches.

    Yields lists of at most ``self.BATCH_SIZE`` assets; a final shorter batch
    is yielded for any remainder. Assets whose hash was already seen in this
    run are skipped. Yields nothing when the source is already aborted, and
    stops taking new issues once ``self._aborted`` becomes true.
    """
    if self._aborted:
        return

    self._reset_runtime_state()

    requested_fields = [
        "summary",
        "description",
        "issuetype",
        "status",
        "priority",
        "project",
        "created",
        "updated",
        "issuelinks",
        "attachment",
        "reporter",
        "assignee",
    ]
    found = self.client.iter_jira_search_jql(
        jql=self._effective_jql(),
        fields=requested_fields,
        max_results=100,
    )

    buffer: list[SingleAssetScanResults] = []
    for issue in self._sample_issues(found):
        if self._aborted:
            break
        for asset in self._extract_issue_assets(issue):
            if self._add_asset_if_new(buffer, asset):
                # Drain every full batch accumulated so far.
                while len(buffer) >= self.BATCH_SIZE:
                    chunk, buffer = buffer[: self.BATCH_SIZE], buffer[self.BATCH_SIZE :]
                    if chunk:
                        yield chunk

    if buffer:
        yield buffer
|
|
149
|
+
|
|
150
|
+
def _reset_runtime_state(self) -> None:
|
|
151
|
+
self._seen_asset_hashes = set()
|
|
152
|
+
self._hash_to_url = {}
|
|
153
|
+
self._asset_content_cache = {}
|
|
154
|
+
self._attachment_url_by_hash = {}
|
|
155
|
+
self._attachment_name_by_hash = {}
|
|
156
|
+
|
|
157
|
+
def _effective_jql(self) -> str:
|
|
158
|
+
scope = self._optional().scope
|
|
159
|
+
project_keys = [
|
|
160
|
+
str(v).strip() for v in (getattr(scope, "project_keys", None) or []) if str(v).strip()
|
|
161
|
+
]
|
|
162
|
+
project_ids = [
|
|
163
|
+
str(v).strip() for v in (getattr(scope, "project_ids", None) or []) if str(v).strip()
|
|
164
|
+
]
|
|
165
|
+
scope_jql = str(getattr(scope, "jql", "") or "").strip()
|
|
166
|
+
|
|
167
|
+
order_by = ""
|
|
168
|
+
if scope_jql:
|
|
169
|
+
idx = scope_jql.lower().find(" order by ")
|
|
170
|
+
if idx >= 0:
|
|
171
|
+
order_by = scope_jql[idx:].strip()
|
|
172
|
+
scope_jql = scope_jql[:idx].strip()
|
|
173
|
+
|
|
174
|
+
project_clauses: list[str] = []
|
|
175
|
+
if project_keys:
|
|
176
|
+
project_clauses.append(f"project in ({', '.join(project_keys)})")
|
|
177
|
+
if project_ids:
|
|
178
|
+
project_clauses.append(f"project in ({', '.join(project_ids)})")
|
|
179
|
+
project_scope = " OR ".join(project_clauses)
|
|
180
|
+
if project_scope:
|
|
181
|
+
project_scope = f"({project_scope})"
|
|
182
|
+
|
|
183
|
+
clauses = [clause for clause in [scope_jql, project_scope] if clause]
|
|
184
|
+
if not clauses:
|
|
185
|
+
base_query = "issuekey IS NOT EMPTY"
|
|
186
|
+
if self.config.sampling.strategy == SamplingStrategy.LATEST:
|
|
187
|
+
return f"{base_query} ORDER BY updated DESC"
|
|
188
|
+
return base_query
|
|
189
|
+
|
|
190
|
+
query = " AND ".join(f"({clause})" for clause in clauses)
|
|
191
|
+
if order_by:
|
|
192
|
+
return f"{query} {order_by}"
|
|
193
|
+
if self.config.sampling.strategy == SamplingStrategy.LATEST:
|
|
194
|
+
return f"{query} ORDER BY updated DESC"
|
|
195
|
+
return query
|
|
196
|
+
|
|
197
|
+
def _sample_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Reduce the search results according to the recipe's sampling settings.

    * ``ALL``: ``issues`` is returned untouched.
    * otherwise at most ``rows_per_page`` issues are kept (100 when unset):
      ``RANDOM`` delegates to ``deterministic_sample``; any other strategy
      keeps the issues with the greatest ``updated`` value.

    Args:
        issues: Issue payloads from the search. Any iterable is accepted; it
            is materialised into a list only when sampling is needed.

    Returns:
        The selected issues.
    """
    sampling = self.config.sampling
    if sampling.strategy == SamplingStrategy.ALL:
        return issues

    # len() and sorted() below need a concrete sequence; the client call is
    # named ``iter_...`` so it may hand back a lazy iterable.
    if not isinstance(issues, list):
        issues = list(issues)

    limit = int(sampling.rows_per_page or 100)
    if limit >= len(issues):
        return issues

    if sampling.strategy == SamplingStrategy.RANDOM:
        return deterministic_sample(issues, limit)

    def updated_at(issue: dict[str, Any]) -> Any:
        fields = issue.get("fields")
        raw = fields.get("updated") if isinstance(fields, dict) else None
        # Missing values become "" (not the string "None"), matching how
        # created/updated are parsed when the issue asset is built.
        # NOTE(review): sorting assumes parse_datetime returns mutually
        # comparable values for every input, including "" - confirm.
        return parse_datetime(str(raw or ""))

    return sorted(issues, key=updated_at, reverse=True)[:limit]
|
|
221
|
+
|
|
222
|
+
def _extract_issue_assets(self, issue: dict[str, Any]) -> list[SingleAssetScanResults]:
|
|
223
|
+
fields = issue.get("fields", {})
|
|
224
|
+
if not isinstance(fields, dict):
|
|
225
|
+
fields = {}
|
|
226
|
+
|
|
227
|
+
now = datetime.now(UTC)
|
|
228
|
+
issue_key = str(issue.get("key") or issue.get("id") or "")
|
|
229
|
+
if not issue_key:
|
|
230
|
+
return []
|
|
231
|
+
|
|
232
|
+
issue_url = f"{self.base_url}/browse/{issue_key}"
|
|
233
|
+
issue_hash = self.generate_hash_id(issue_url)
|
|
234
|
+
summary = str(fields.get("summary") or issue_key)
|
|
235
|
+
|
|
236
|
+
description_text, description_urls = self._text_and_urls_from_adf(fields.get("description"))
|
|
237
|
+
comment_asset, comment_hashes, comment_urls = self._comments_asset(
|
|
238
|
+
issue_key, issue_url, now
|
|
239
|
+
)
|
|
240
|
+
attachment_assets, attachment_hashes = self._attachment_assets(issue, issue_hash, now)
|
|
241
|
+
linked_issue_hashes = self._linked_issue_hashes(fields.get("issuelinks"))
|
|
242
|
+
|
|
243
|
+
all_url_hashes = [
|
|
244
|
+
self.generate_hash_id(url)
|
|
245
|
+
for url in (
|
|
246
|
+
normalize_http_url(value, base_url=self.base_url)
|
|
247
|
+
for value in [*description_urls, *comment_urls]
|
|
248
|
+
)
|
|
249
|
+
if url
|
|
250
|
+
]
|
|
251
|
+
|
|
252
|
+
issue_links = dedupe_preserve_order(
|
|
253
|
+
[*linked_issue_hashes, *attachment_hashes, *comment_hashes, *all_url_hashes]
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
issue_metadata = {
|
|
257
|
+
"issue_key": issue_key,
|
|
258
|
+
"summary": summary,
|
|
259
|
+
"status": fields.get("status"),
|
|
260
|
+
"priority": fields.get("priority"),
|
|
261
|
+
"project": fields.get("project"),
|
|
262
|
+
"updated": fields.get("updated"),
|
|
263
|
+
"attachments_count": len(attachment_hashes),
|
|
264
|
+
"links_count": len(issue_links),
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
issue_text_lines = [
|
|
268
|
+
f"key={issue_key}",
|
|
269
|
+
f"summary={summary}",
|
|
270
|
+
f"status={self._value_name(fields.get('status'))}",
|
|
271
|
+
f"issue_type={self._value_name(fields.get('issuetype'))}",
|
|
272
|
+
f"priority={self._value_name(fields.get('priority'))}",
|
|
273
|
+
"",
|
|
274
|
+
description_text,
|
|
275
|
+
]
|
|
276
|
+
issue_text = "\n".join(line for line in issue_text_lines if line).strip()
|
|
277
|
+
self._asset_content_cache[issue_hash] = (json_dumps(issue_metadata), issue_text)
|
|
278
|
+
|
|
279
|
+
issue_asset = SingleAssetScanResults(
|
|
280
|
+
hash=issue_hash,
|
|
281
|
+
checksum=self.calculate_checksum(issue_metadata),
|
|
282
|
+
name=f"{issue_key}: {summary}",
|
|
283
|
+
external_url=issue_url,
|
|
284
|
+
links=issue_links,
|
|
285
|
+
asset_type=OutputAssetType.TXT,
|
|
286
|
+
source_id=self.source_id,
|
|
287
|
+
created_at=parse_datetime(str(fields.get("created") or "")),
|
|
288
|
+
updated_at=parse_datetime(str(fields.get("updated") or "")),
|
|
289
|
+
runner_id=self.runner_id,
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
assets: list[SingleAssetScanResults] = [issue_asset]
|
|
293
|
+
if comment_asset:
|
|
294
|
+
assets.append(comment_asset)
|
|
295
|
+
assets.extend(attachment_assets)
|
|
296
|
+
return assets
|
|
297
|
+
|
|
298
|
+
def _comments_asset(
|
|
299
|
+
self,
|
|
300
|
+
issue_key: str,
|
|
301
|
+
issue_url: str,
|
|
302
|
+
now: datetime,
|
|
303
|
+
) -> tuple[SingleAssetScanResults | None, list[str], list[str]]:
|
|
304
|
+
if not self.include_comments:
|
|
305
|
+
return None, [], []
|
|
306
|
+
|
|
307
|
+
comments = self._fetch_issue_comments(issue_key)
|
|
308
|
+
if not comments:
|
|
309
|
+
return None, [], []
|
|
310
|
+
|
|
311
|
+
text_blocks: list[str] = []
|
|
312
|
+
urls: list[str] = []
|
|
313
|
+
for comment in comments:
|
|
314
|
+
body = comment.get("body")
|
|
315
|
+
text, body_urls = self._text_and_urls_from_adf(body)
|
|
316
|
+
if text:
|
|
317
|
+
text_blocks.append(text)
|
|
318
|
+
urls.extend(body_urls)
|
|
319
|
+
|
|
320
|
+
combined_text = "\n\n".join(block for block in text_blocks if block).strip()
|
|
321
|
+
if not combined_text:
|
|
322
|
+
return None, [], urls
|
|
323
|
+
|
|
324
|
+
# Keep a distinct, URL-stable comments asset identifier (fragments are stripped in URL normalization).
|
|
325
|
+
comments_url = f"{issue_url}?view=comments"
|
|
326
|
+
comments_hash = self.generate_hash_id(comments_url)
|
|
327
|
+
|
|
328
|
+
comment_link_hashes = [
|
|
329
|
+
self.generate_hash_id(normalized)
|
|
330
|
+
for normalized in (
|
|
331
|
+
normalize_http_url(url, base_url=self.base_url)
|
|
332
|
+
for url in dedupe_preserve_order(urls)
|
|
333
|
+
)
|
|
334
|
+
if normalized
|
|
335
|
+
]
|
|
336
|
+
self._asset_content_cache[comments_hash] = (combined_text, combined_text)
|
|
337
|
+
|
|
338
|
+
asset = SingleAssetScanResults(
|
|
339
|
+
hash=comments_hash,
|
|
340
|
+
checksum=self.calculate_checksum(
|
|
341
|
+
{
|
|
342
|
+
"issue_key": issue_key,
|
|
343
|
+
"comments_count": len(comments),
|
|
344
|
+
"text_length": len(combined_text),
|
|
345
|
+
}
|
|
346
|
+
),
|
|
347
|
+
name=f"Comments for issue {issue_key}",
|
|
348
|
+
external_url=comments_url,
|
|
349
|
+
links=comment_link_hashes,
|
|
350
|
+
asset_type=OutputAssetType.TXT,
|
|
351
|
+
source_id=self.source_id,
|
|
352
|
+
created_at=now,
|
|
353
|
+
updated_at=now,
|
|
354
|
+
runner_id=self.runner_id,
|
|
355
|
+
)
|
|
356
|
+
return asset, [comments_hash], urls
|
|
357
|
+
|
|
358
|
+
def _fetch_issue_comments(self, issue_key: str) -> list[dict[str, Any]]:
|
|
359
|
+
all_comments: list[dict[str, Any]] = []
|
|
360
|
+
start_at = 0
|
|
361
|
+
while True:
|
|
362
|
+
payload = self.client.get_json(
|
|
363
|
+
f"/rest/api/3/issue/{issue_key}/comment",
|
|
364
|
+
params={"startAt": start_at, "maxResults": 100, "orderBy": "created"},
|
|
365
|
+
)
|
|
366
|
+
comments = payload.get("comments", [])
|
|
367
|
+
if isinstance(comments, list):
|
|
368
|
+
all_comments.extend([comment for comment in comments if isinstance(comment, dict)])
|
|
369
|
+
|
|
370
|
+
max_results = int(payload.get("maxResults") or 0)
|
|
371
|
+
total = int(payload.get("total") or len(all_comments))
|
|
372
|
+
start_at += max_results if max_results > 0 else len(comments)
|
|
373
|
+
if start_at >= total or not comments:
|
|
374
|
+
break
|
|
375
|
+
return all_comments
|
|
376
|
+
|
|
377
|
+
def _attachment_assets(
|
|
378
|
+
self,
|
|
379
|
+
issue: dict[str, Any],
|
|
380
|
+
issue_hash: str,
|
|
381
|
+
now: datetime,
|
|
382
|
+
) -> tuple[list[SingleAssetScanResults], list[str]]:
|
|
383
|
+
if not self.include_attachments:
|
|
384
|
+
return [], []
|
|
385
|
+
fields = issue.get("fields", {})
|
|
386
|
+
if not isinstance(fields, dict):
|
|
387
|
+
return [], []
|
|
388
|
+
|
|
389
|
+
assets: list[SingleAssetScanResults] = []
|
|
390
|
+
hashes: list[str] = []
|
|
391
|
+
attachments = fields.get("attachment")
|
|
392
|
+
if not isinstance(attachments, list):
|
|
393
|
+
return assets, hashes
|
|
394
|
+
|
|
395
|
+
for attachment in attachments:
|
|
396
|
+
if not isinstance(attachment, dict):
|
|
397
|
+
continue
|
|
398
|
+
content_url = attachment.get("content")
|
|
399
|
+
if not isinstance(content_url, str):
|
|
400
|
+
continue
|
|
401
|
+
normalized_url = normalize_http_url(content_url, base_url=self.base_url)
|
|
402
|
+
if not normalized_url:
|
|
403
|
+
continue
|
|
404
|
+
|
|
405
|
+
attachment_hash = self.generate_hash_id(normalized_url)
|
|
406
|
+
self._attachment_url_by_hash[attachment_hash] = normalized_url
|
|
407
|
+
mime = str(attachment.get("mimeType") or "").lower()
|
|
408
|
+
filename = str(attachment.get("filename") or f"attachment-{attachment.get('id')}")
|
|
409
|
+
self._attachment_name_by_hash[attachment_hash] = filename
|
|
410
|
+
metadata = {
|
|
411
|
+
"issue_hash": issue_hash,
|
|
412
|
+
"attachment_id": attachment.get("id"),
|
|
413
|
+
"mime_type": mime,
|
|
414
|
+
"size": attachment.get("size"),
|
|
415
|
+
"filename": filename,
|
|
416
|
+
}
|
|
417
|
+
assets.append(
|
|
418
|
+
SingleAssetScanResults(
|
|
419
|
+
hash=attachment_hash,
|
|
420
|
+
checksum=self.calculate_checksum(metadata),
|
|
421
|
+
name=filename,
|
|
422
|
+
external_url=normalized_url,
|
|
423
|
+
links=[],
|
|
424
|
+
asset_type=self._asset_type_from_mime_or_name(mime, filename),
|
|
425
|
+
source_id=self.source_id,
|
|
426
|
+
created_at=now,
|
|
427
|
+
updated_at=now,
|
|
428
|
+
runner_id=self.runner_id,
|
|
429
|
+
)
|
|
430
|
+
)
|
|
431
|
+
hashes.append(attachment_hash)
|
|
432
|
+
return assets, hashes
|
|
433
|
+
|
|
434
|
+
def _linked_issue_hashes(self, links: Any) -> list[str]:
|
|
435
|
+
if not isinstance(links, list):
|
|
436
|
+
return []
|
|
437
|
+
hashes: list[str] = []
|
|
438
|
+
for link in links:
|
|
439
|
+
if not isinstance(link, dict):
|
|
440
|
+
continue
|
|
441
|
+
for side in ("inwardIssue", "outwardIssue"):
|
|
442
|
+
issue_obj = link.get(side)
|
|
443
|
+
if not isinstance(issue_obj, dict):
|
|
444
|
+
continue
|
|
445
|
+
issue_key = issue_obj.get("key")
|
|
446
|
+
if not isinstance(issue_key, str):
|
|
447
|
+
continue
|
|
448
|
+
linked_url = f"{self.base_url}/browse/{issue_key}"
|
|
449
|
+
hashes.append(self.generate_hash_id(linked_url))
|
|
450
|
+
return dedupe_preserve_order(hashes)
|
|
451
|
+
|
|
452
|
+
def _text_and_urls_from_adf(self, value: Any) -> tuple[str, list[str]]:
    """Extract plain text and URLs from a description or comment body.

    Strings are returned unchanged with the URLs found in them. Any other
    value is passed to ``parse_atlassian_document``; the URLs it reports are
    merged, without duplicates, with URLs found in the extracted text.
    """
    if not isinstance(value, str):
        text, document_urls = parse_atlassian_document(value)
        return text, dedupe_preserve_order(document_urls + extract_urls_from_text(text))
    return value, extract_urls_from_text(value)
|
|
457
|
+
|
|
458
|
+
def _value_name(self, value: Any) -> str:
|
|
459
|
+
if isinstance(value, dict):
|
|
460
|
+
name = value.get("name")
|
|
461
|
+
if isinstance(name, str):
|
|
462
|
+
return name
|
|
463
|
+
return str(value or "")
|
|
464
|
+
|
|
465
|
+
def _asset_type_from_mime_or_name(
    self,
    mime_type: str,
    file_name: str,
) -> OutputAssetType:
    """Classify an attachment by MIME type, using the file name as a fallback.

    The file name is consulted only when the MIME type maps to ``BINARY``;
    a tabular file name then upgrades the result to ``TABLE``.
    """
    by_mime = self._asset_type_from_mime(mime_type)
    if by_mime == OutputAssetType.BINARY and is_tabular_filename(file_name):
        return OutputAssetType.TABLE
    return by_mime
|
|
476
|
+
|
|
477
|
+
def _asset_type_from_mime(self, mime_type: str) -> OutputAssetType:
    """Map a MIME type to an output asset type.

    Media-type parameters and surrounding whitespace are ignored, so
    ``"text/plain; charset=utf-8"`` is classified like ``"text/plain"``.

    Args:
        mime_type: Media type string; may be empty.

    Returns:
        ``IMAGE``, ``VIDEO`` or ``AUDIO`` by top-level type, ``TABLE`` for
        tabular types, ``TXT`` for plain text, JSON and XML, ``URL`` for
        HTML, and ``BINARY`` for everything else.
    """
    # Keep only the "type/subtype" part; parameters follow the first ";".
    normalized = mime_type.split(";", 1)[0].strip().lower()
    if normalized.startswith("image/"):
        return OutputAssetType.IMAGE
    if normalized.startswith("video/"):
        return OutputAssetType.VIDEO
    if normalized.startswith("audio/"):
        return OutputAssetType.AUDIO
    if is_tabular_mime_type(normalized):
        return OutputAssetType.TABLE
    if normalized in {
        "text/plain",
        "application/json",
        "application/xml",
        "text/xml",
    }:
        return OutputAssetType.TXT
    if normalized == "text/html":
        return OutputAssetType.URL
    return OutputAssetType.BINARY
|
|
497
|
+
|
|
498
|
+
def _add_asset_if_new(
|
|
499
|
+
self,
|
|
500
|
+
assets: list[SingleAssetScanResults],
|
|
501
|
+
asset: SingleAssetScanResults,
|
|
502
|
+
) -> bool:
|
|
503
|
+
if asset.hash in self._seen_asset_hashes:
|
|
504
|
+
return False
|
|
505
|
+
self._seen_asset_hashes.add(asset.hash)
|
|
506
|
+
assets.append(asset)
|
|
507
|
+
return True
|
|
508
|
+
|
|
509
|
+
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
    """Download an asset's raw bytes and resolve its MIME type.

    Args:
        asset_id: An asset hash, or a URL that is normalized and hashed
            first.

    Returns:
        ``(bytes, mime_type)``, with the bytes cut to
        ``attachment_max_bytes`` when that limit is positive. ``None`` is
        returned when no URL is known for the asset or the download fails
        (the failure is logged as a warning).
    """
    normalized = normalize_http_url(asset_id, base_url=self.base_url)
    lookup_key = self.generate_hash_id(normalized) if normalized else asset_id

    attachment_url = self._attachment_url_by_hash.get(lookup_key) or self._hash_to_url.get(
        lookup_key
    )
    if not attachment_url:
        return None

    try:
        file_bytes, declared_mime = self.client.get_bytes(attachment_url)
    except Exception as exc:
        logger.warning("Failed to fetch Jira attachment bytes for %s: %s", attachment_url, exc)
        return None

    byte_limit = self.attachment_max_bytes
    if 0 < byte_limit < len(file_bytes):
        file_bytes = file_bytes[:byte_limit]

    # NOTE(review): _attachment_file_name is not defined in this file;
    # presumably inherited from BaseSource - confirm.
    file_name = self._attachment_file_name(lookup_key, attachment_url)
    mime_type = resolve_mime_type(
        file_bytes,
        declared_mime_type=declared_mime,
        file_name=file_name,
    )
    return file_bytes, mime_type
|
|
536
|
+
|
|
537
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset.

    The content cache is checked first, by ``asset_id`` as given and then by
    the hash of its normalized URL. On a miss the asset is downloaded, cut
    to ``attachment_max_bytes`` when that limit is positive, parsed, and
    cached if it produced text.

    Returns:
        The content pair, or ``None`` when no URL is known for the asset,
        the download fails (logged as a warning), or parsing yields no text.
    """
    if cached := self._asset_content_cache.get(asset_id):
        return cached

    normalized = normalize_http_url(asset_id, base_url=self.base_url)
    if normalized:
        asset_id = self.generate_hash_id(normalized)
        if cached := self._asset_content_cache.get(asset_id):
            return cached

    attachment_url = self._attachment_url_by_hash.get(asset_id) or self._hash_to_url.get(
        asset_id
    )
    if not attachment_url:
        return None

    try:
        file_bytes, declared_mime = self.client.get_bytes(attachment_url)
    except Exception as exc:
        logger.warning("Failed to fetch Jira attachment %s: %s", attachment_url, exc)
        return None

    byte_limit = self.attachment_max_bytes
    if 0 < byte_limit < len(file_bytes):
        file_bytes = file_bytes[:byte_limit]

    # NOTE(review): _attachment_file_name is not defined in this file;
    # presumably inherited from BaseSource - confirm.
    parsed = self.parse_asset_bytes(
        file_bytes,
        declared_mime_type=declared_mime,
        file_name=self._attachment_file_name(asset_id, attachment_url),
    )
    if not parsed.text_content:
        return None

    content = (parsed.raw_content, parsed.text_content)
    self._asset_content_cache[asset_id] = content
    return content
|
|
575
|
+
|
|
576
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Hash an asset URL and remember which URL the hash stands for.

    Args:
        asset_id: URL of the asset; normalized against ``self.base_url``.

    Returns:
        The hash of the normalized URL.

    Raises:
        ValueError: If ``asset_id`` does not normalize to a URL.
    """
    normalized = normalize_http_url(asset_id, base_url=self.base_url)
    if normalized:
        digest = hash_url(normalized, base_url=self.base_url)
        # Reverse map used later to turn hashes back into URLs.
        self._hash_to_url[digest] = normalized
        return digest
    raise ValueError(f"Invalid URL for hash: {asset_id}")
|
|
583
|
+
|
|
584
|
+
def resolve_link_for_detection(self, link: str) -> str | None:
    """Translate a link into a URL.

    Returns the URL recorded for ``link`` in ``_hash_to_url`` when there is
    one; otherwise the result of normalizing ``link`` itself as a URL.
    """
    return self._hash_to_url.get(link) or normalize_http_url(link)
|
|
589
|
+
|
|
590
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set the finding's location to the asset's external URL.

    ``text_content`` is accepted but not used.
    """
    del text_content  # intentionally unused
    finding.location = Location(path=asset.external_url)
|
|
598
|
+
|
|
599
|
+
def abort(self) -> None:
    """Log the abort, run the base-class abort, then close the HTTP client."""
    logger.info("Aborting Jira extraction...")
    super().abort()
    # Closed last, after the base class has handled the abort.
    client = self.client
    client.close()
|
|
603
|
+
|
|
604
|
+
def cleanup(self) -> None:
    """Close the Atlassian client held by this source."""
    client = self.client
    client.close()
|