classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,3 @@
1
+ from .source import JiraSource
2
+
3
+ __all__ = ["JiraSource"]
@@ -0,0 +1,605 @@
1
+ import logging
2
+ from collections.abc import AsyncGenerator
3
+ from datetime import UTC, datetime
4
+ from typing import Any
5
+
6
+ from ...models.generated_input import (
7
+ JiraInput,
8
+ JiraOptional,
9
+ JiraOptionalConnection,
10
+ JiraOptionalContent,
11
+ SamplingStrategy,
12
+ )
13
+ from ...models.generated_single_asset_scan_results import (
14
+ AssetType as OutputAssetType,
15
+ )
16
+ from ...models.generated_single_asset_scan_results import (
17
+ DetectionResult,
18
+ Location,
19
+ SingleAssetScanResults,
20
+ )
21
+ from ...utils.file_parser import resolve_mime_type
22
+ from ...utils.hashing import hash_url, normalize_http_url
23
+ from ..atlassian_common import (
24
+ AtlassianCloudClient,
25
+ dedupe_preserve_order,
26
+ deterministic_sample,
27
+ extract_urls_from_text,
28
+ is_tabular_filename,
29
+ is_tabular_mime_type,
30
+ json_dumps,
31
+ normalize_atlassian_base_url,
32
+ parse_atlassian_document,
33
+ parse_datetime,
34
+ )
35
+ from ..base import BaseSource
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ class JiraSource(BaseSource):
41
+ source_type = "jira"
42
+
43
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
):
    """Validate the recipe, build the Atlassian client and create per-run caches.

    Args:
        recipe: Raw source recipe; validated against ``JiraInput``.
        source_id: Identifier forwarded to ``BaseSource``.
        runner_id: Identifier of the current run; ``"local-run"`` when omitted.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)
    self.config = JiraInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    self.base_url = normalize_atlassian_base_url(str(self.config.required.base_url))
    connection = self._connection_options()
    # NOTE: the ``or`` fallbacks below apply to unset *and* zero values.
    self.client = AtlassianCloudClient(
        base_url=self.base_url,
        account_email=str(self.config.required.account_email),
        api_token=self.config.masked.api_token,
        request_timeout_seconds=float(connection.request_timeout_seconds or 30),
        max_retries=int(connection.max_retries or 3),
        rate_limit_delay_seconds=float(connection.rate_limit_delay_seconds or 0),
    )

    content_options = self._content_options()
    # Comments and attachments are included unless explicitly set to False.
    self.include_comments = content_options.include_comments is not False
    self.include_attachments = content_options.include_attachments is not False
    self.attachment_max_bytes = int(content_options.attachment_max_bytes or 5_242_880)

    self._seen_asset_hashes: set[str] = set()
    self._hash_to_url: dict[str, str] = {}
    self._asset_content_cache: dict[str, tuple[str, str]] = {}
    self._attachment_url_by_hash: dict[str, str] = {}
    # Written by _attachment_assets(); must exist even if extract_raw() (which
    # resets the runtime state) has not been called yet.
    self._attachment_name_by_hash: dict[str, str] = {}
73
+
74
def _optional(self) -> JiraOptional:
    """Return the recipe's optional section, or an empty ``JiraOptional`` when absent."""
    return self.config.optional or JiraOptional()
78
+
79
def _connection_options(self) -> JiraOptionalConnection:
    """Return the configured connection options, or a default-constructed set."""
    configured = self._optional().connection
    return configured if configured else JiraOptionalConnection()
84
+
85
def _content_options(self) -> JiraOptionalContent:
    """Return the configured content options, or a default-constructed set."""
    configured = self._optional().content
    return configured if configured else JiraOptionalContent()
90
+
91
+ def test_connection(self) -> dict[str, Any]:
92
+ result = {
93
+ "timestamp": datetime.now(UTC).isoformat(),
94
+ "source_type": self.recipe.get("type"),
95
+ }
96
+ try:
97
+ self.client.get_json("/rest/api/3/project/search", params={"maxResults": 1})
98
+ result["status"] = "SUCCESS"
99
+ result["message"] = "Successfully connected to Jira Cloud API."
100
+ except Exception as exc:
101
+ result["status"] = "FAILURE"
102
+ result["message"] = f"Failed to connect to Jira Cloud API: {exc}"
103
+ return result
104
+
105
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield batches of assets built from the issues matching the effective JQL.

    Each issue contributes its own asset plus optional comments and
    attachment assets. Assets whose hash was already seen in this run are
    skipped. Batches hold ``self.BATCH_SIZE`` assets; the last one may be
    smaller. Nothing is yielded if the source was aborted beforehand.
    """
    if self._aborted:
        return

    self._reset_runtime_state()

    requested_fields = [
        "summary",
        "description",
        "issuetype",
        "status",
        "priority",
        "project",
        "created",
        "updated",
        "issuelinks",
        "attachment",
        "reporter",
        "assignee",
    ]
    matching_issues = self.client.iter_jira_search_jql(
        jql=self._effective_jql(),
        fields=requested_fields,
        max_results=100,
    )

    batch: list[SingleAssetScanResults] = []
    for issue in self._sample_issues(matching_issues):
        if self._aborted:
            break
        for asset in self._extract_issue_assets(issue):
            if self._add_asset_if_new(batch, asset):
                # Flush full batches as soon as the buffer reaches the batch size.
                while len(batch) >= self.BATCH_SIZE:
                    chunk, batch = batch[: self.BATCH_SIZE], batch[self.BATCH_SIZE :]
                    if chunk:
                        yield chunk

    if batch:
        yield batch
149
+
150
+ def _reset_runtime_state(self) -> None:
151
+ self._seen_asset_hashes = set()
152
+ self._hash_to_url = {}
153
+ self._asset_content_cache = {}
154
+ self._attachment_url_by_hash = {}
155
+ self._attachment_name_by_hash = {}
156
+
157
+ def _effective_jql(self) -> str:
158
+ scope = self._optional().scope
159
+ project_keys = [
160
+ str(v).strip() for v in (getattr(scope, "project_keys", None) or []) if str(v).strip()
161
+ ]
162
+ project_ids = [
163
+ str(v).strip() for v in (getattr(scope, "project_ids", None) or []) if str(v).strip()
164
+ ]
165
+ scope_jql = str(getattr(scope, "jql", "") or "").strip()
166
+
167
+ order_by = ""
168
+ if scope_jql:
169
+ idx = scope_jql.lower().find(" order by ")
170
+ if idx >= 0:
171
+ order_by = scope_jql[idx:].strip()
172
+ scope_jql = scope_jql[:idx].strip()
173
+
174
+ project_clauses: list[str] = []
175
+ if project_keys:
176
+ project_clauses.append(f"project in ({', '.join(project_keys)})")
177
+ if project_ids:
178
+ project_clauses.append(f"project in ({', '.join(project_ids)})")
179
+ project_scope = " OR ".join(project_clauses)
180
+ if project_scope:
181
+ project_scope = f"({project_scope})"
182
+
183
+ clauses = [clause for clause in [scope_jql, project_scope] if clause]
184
+ if not clauses:
185
+ base_query = "issuekey IS NOT EMPTY"
186
+ if self.config.sampling.strategy == SamplingStrategy.LATEST:
187
+ return f"{base_query} ORDER BY updated DESC"
188
+ return base_query
189
+
190
+ query = " AND ".join(f"({clause})" for clause in clauses)
191
+ if order_by:
192
+ return f"{query} {order_by}"
193
+ if self.config.sampling.strategy == SamplingStrategy.LATEST:
194
+ return f"{query} ORDER BY updated DESC"
195
+ return query
196
+
197
def _sample_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Reduce ``issues`` according to the configured sampling strategy.

    * ``ALL``: the input is returned untouched.
    * ``RANDOM``: ``deterministic_sample`` picks ``rows_per_page`` issues.
    * otherwise: the ``rows_per_page`` most recently updated issues.

    ``rows_per_page`` defaults to 100. If the limit already covers every
    issue, all of them are returned.

    Args:
        issues: Issue payloads from the Jira search; may be a lazy iterable.

    Returns:
        The issues to process in this run.
    """
    sampling = self.config.sampling
    if sampling.strategy == SamplingStrategy.ALL:
        return issues

    # len() and sorting below need a real sequence; the client method is
    # named ``iter_...`` and may hand back a lazy iterator.
    if not isinstance(issues, list):
        issues = list(issues)

    limit = int(sampling.rows_per_page or 100)
    if limit >= len(issues):
        return issues

    if sampling.strategy == SamplingStrategy.RANDOM:
        return deterministic_sample(issues, limit)

    def _updated_at(issue: dict[str, Any]) -> Any:
        fields = issue.get("fields")
        raw = fields.get("updated") if isinstance(fields, dict) else None
        # A missing timestamp is passed as "" (not the string "None"),
        # matching how created/updated are parsed elsewhere in this class.
        return parse_datetime(str(raw or ""))

    return sorted(issues, key=_updated_at, reverse=True)[:limit]
221
+
222
def _extract_issue_assets(self, issue: dict[str, Any]) -> list[SingleAssetScanResults]:
    """Turn one Jira issue payload into its scan assets.

    Produces the issue asset itself, followed by the combined comments asset
    (if any) and one asset per attachment. The issue asset links to linked
    issues, attachments, the comments asset and every URL found in the
    description or comments. The issue's metadata JSON and text are stored in
    the content cache under the issue hash.

    Returns:
        The list of assets, or an empty list when the issue has no key or id.
    """
    raw_fields = issue.get("fields", {})
    fields = raw_fields if isinstance(raw_fields, dict) else {}

    now = datetime.now(UTC)
    issue_key = str(issue.get("key") or issue.get("id") or "")
    if not issue_key:
        return []

    issue_url = f"{self.base_url}/browse/{issue_key}"
    issue_hash = self.generate_hash_id(issue_url)
    summary = str(fields.get("summary") or issue_key)

    description_text, description_urls = self._text_and_urls_from_adf(fields.get("description"))
    comment_asset, comment_hashes, comment_urls = self._comments_asset(issue_key, issue_url, now)
    attachment_assets, attachment_hashes = self._attachment_assets(issue, issue_hash, now)
    linked_issue_hashes = self._linked_issue_hashes(fields.get("issuelinks"))

    # Hash every URL mentioned in the description or comments that normalizes cleanly.
    url_hashes: list[str] = []
    for raw_url in [*description_urls, *comment_urls]:
        normalized_url = normalize_http_url(raw_url, base_url=self.base_url)
        if normalized_url:
            url_hashes.append(self.generate_hash_id(normalized_url))

    issue_links = dedupe_preserve_order(
        linked_issue_hashes + attachment_hashes + comment_hashes + url_hashes
    )

    issue_metadata = {
        "issue_key": issue_key,
        "summary": summary,
        "status": fields.get("status"),
        "priority": fields.get("priority"),
        "project": fields.get("project"),
        "updated": fields.get("updated"),
        "attachments_count": len(attachment_hashes),
        "links_count": len(issue_links),
    }

    # Empty parts (e.g. a missing description) are dropped before joining.
    text_parts = [
        f"key={issue_key}",
        f"summary={summary}",
        f"status={self._value_name(fields.get('status'))}",
        f"issue_type={self._value_name(fields.get('issuetype'))}",
        f"priority={self._value_name(fields.get('priority'))}",
        description_text,
    ]
    issue_text = "\n".join(part for part in text_parts if part).strip()
    self._asset_content_cache[issue_hash] = (json_dumps(issue_metadata), issue_text)

    issue_asset = SingleAssetScanResults(
        hash=issue_hash,
        checksum=self.calculate_checksum(issue_metadata),
        name=f"{issue_key}: {summary}",
        external_url=issue_url,
        links=issue_links,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=parse_datetime(str(fields.get("created") or "")),
        updated_at=parse_datetime(str(fields.get("updated") or "")),
        runner_id=self.runner_id,
    )

    produced: list[SingleAssetScanResults] = [issue_asset]
    if comment_asset:
        produced.append(comment_asset)
    produced += attachment_assets
    return produced
297
+
298
def _comments_asset(
    self,
    issue_key: str,
    issue_url: str,
    now: datetime,
) -> tuple[SingleAssetScanResults | None, list[str], list[str]]:
    """Build one asset holding the text of all comments on an issue.

    Args:
        issue_key: Key of the issue whose comments are fetched.
        issue_url: Browse URL of the issue; the comments asset URL derives from it.
        now: Timestamp used for the asset's ``created_at`` / ``updated_at``.

    Returns:
        ``(asset, [asset_hash], urls)``. ``asset`` is ``None`` and the hash
        list is empty when comments are disabled, the issue has none, or none
        contain text. ``urls`` are the URLs found in the comment bodies.
    """
    import hashlib

    if not self.include_comments:
        return None, [], []

    comments = self._fetch_issue_comments(issue_key)
    if not comments:
        return None, [], []

    text_blocks: list[str] = []
    urls: list[str] = []
    for comment in comments:
        text, body_urls = self._text_and_urls_from_adf(comment.get("body"))
        if text:
            text_blocks.append(text)
        urls.extend(body_urls)

    combined_text = "\n\n".join(block for block in text_blocks if block).strip()
    if not combined_text:
        return None, [], urls

    # Keep a distinct, URL-stable comments asset identifier (fragments are stripped in URL normalization).
    comments_url = f"{issue_url}?view=comments"
    comments_hash = self.generate_hash_id(comments_url)

    comment_link_hashes = [
        self.generate_hash_id(normalized)
        for normalized in (
            normalize_http_url(url, base_url=self.base_url)
            for url in dedupe_preserve_order(urls)
        )
        if normalized
    ]
    self._asset_content_cache[comments_hash] = (combined_text, combined_text)

    asset = SingleAssetScanResults(
        hash=comments_hash,
        checksum=self.calculate_checksum(
            {
                "issue_key": issue_key,
                "comments_count": len(comments),
                "text_length": len(combined_text),
                # Count and length alone miss an edit that keeps the same
                # length; a digest of the text makes the checksum track content.
                "text_sha256": hashlib.sha256(combined_text.encode("utf-8")).hexdigest(),
            }
        ),
        name=f"Comments for issue {issue_key}",
        external_url=comments_url,
        links=comment_link_hashes,
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=now,
        updated_at=now,
        runner_id=self.runner_id,
    )
    return asset, [comments_hash], urls
357
+
358
+ def _fetch_issue_comments(self, issue_key: str) -> list[dict[str, Any]]:
359
+ all_comments: list[dict[str, Any]] = []
360
+ start_at = 0
361
+ while True:
362
+ payload = self.client.get_json(
363
+ f"/rest/api/3/issue/{issue_key}/comment",
364
+ params={"startAt": start_at, "maxResults": 100, "orderBy": "created"},
365
+ )
366
+ comments = payload.get("comments", [])
367
+ if isinstance(comments, list):
368
+ all_comments.extend([comment for comment in comments if isinstance(comment, dict)])
369
+
370
+ max_results = int(payload.get("maxResults") or 0)
371
+ total = int(payload.get("total") or len(all_comments))
372
+ start_at += max_results if max_results > 0 else len(comments)
373
+ if start_at >= total or not comments:
374
+ break
375
+ return all_comments
376
+
377
def _attachment_assets(
    self,
    issue: dict[str, Any],
    issue_hash: str,
    now: datetime,
) -> tuple[list[SingleAssetScanResults], list[str]]:
    """Create one asset per attachment listed on the issue.

    Attachments without a string ``content`` URL, or whose URL does not
    normalize, are skipped. The download URL and file name of each accepted
    attachment are recorded by hash for later content fetching.

    Returns:
        ``(assets, hashes)`` in attachment order; both empty when attachments
        are disabled or the payload has no attachment list.
    """
    if not self.include_attachments:
        return [], []

    fields = issue.get("fields", {})
    if not isinstance(fields, dict):
        return [], []

    built: list[SingleAssetScanResults] = []
    built_hashes: list[str] = []

    listed = fields.get("attachment")
    for entry in listed if isinstance(listed, list) else []:
        if not isinstance(entry, dict):
            continue
        raw_url = entry.get("content")
        if not isinstance(raw_url, str):
            continue
        download_url = normalize_http_url(raw_url, base_url=self.base_url)
        if not download_url:
            continue

        entry_hash = self.generate_hash_id(download_url)
        self._attachment_url_by_hash[entry_hash] = download_url
        mime = str(entry.get("mimeType") or "").lower()
        filename = str(entry.get("filename") or f"attachment-{entry.get('id')}")
        self._attachment_name_by_hash[entry_hash] = filename

        checksum_input = {
            "issue_hash": issue_hash,
            "attachment_id": entry.get("id"),
            "mime_type": mime,
            "size": entry.get("size"),
            "filename": filename,
        }
        asset = SingleAssetScanResults(
            hash=entry_hash,
            checksum=self.calculate_checksum(checksum_input),
            name=filename,
            external_url=download_url,
            links=[],
            asset_type=self._asset_type_from_mime_or_name(mime, filename),
            source_id=self.source_id,
            created_at=now,
            updated_at=now,
            runner_id=self.runner_id,
        )
        built.append(asset)
        built_hashes.append(entry_hash)

    return built, built_hashes
433
+
434
def _linked_issue_hashes(self, links: Any) -> list[str]:
    """Return de-duplicated hashes of the issues referenced by ``issuelinks``.

    Both the inward and the outward side of each link are considered.
    Malformed entries (non-dict links or issues, non-string keys) are ignored.
    """
    if not isinstance(links, list):
        return []

    collected: list[str] = []
    for entry in links:
        if not isinstance(entry, dict):
            continue
        for side in ("inwardIssue", "outwardIssue"):
            related = entry.get(side)
            key = related.get("key") if isinstance(related, dict) else None
            if isinstance(key, str):
                collected.append(self.generate_hash_id(f"{self.base_url}/browse/{key}"))
    return dedupe_preserve_order(collected)
451
+
452
def _text_and_urls_from_adf(self, value: Any) -> tuple[str, list[str]]:
    """Extract plain text and URLs from a Jira rich-text field.

    A plain string is returned as-is with the URLs found in it. Any other
    value goes through ``parse_atlassian_document``; its URLs are merged with
    those found in the extracted text, without duplicates.
    """
    if not isinstance(value, str):
        text, document_urls = parse_atlassian_document(value)
        merged = document_urls + extract_urls_from_text(text)
        return text, dedupe_preserve_order(merged)
    return value, extract_urls_from_text(value)
457
+
458
+ def _value_name(self, value: Any) -> str:
459
+ if isinstance(value, dict):
460
+ name = value.get("name")
461
+ if isinstance(name, str):
462
+ return name
463
+ return str(value or "")
464
+
465
def _asset_type_from_mime_or_name(
    self,
    mime_type: str,
    file_name: str,
) -> OutputAssetType:
    """Classify an attachment by MIME type, falling back to its file name.

    The file name is consulted only when the MIME type resolves to BINARY;
    a tabular file name then upgrades the result to TABLE.
    """
    by_mime = self._asset_type_from_mime(mime_type)
    if by_mime == OutputAssetType.BINARY and is_tabular_filename(file_name):
        return OutputAssetType.TABLE
    return by_mime
476
+
477
def _asset_type_from_mime(self, mime_type: str) -> OutputAssetType:
    """Map a MIME type to an output asset type.

    Media-type parameters (for example ``; charset=utf-8``) are ignored, so
    ``text/plain; charset=utf-8`` is classified like ``text/plain``.

    Args:
        mime_type: Declared MIME type; may be empty.

    Returns:
        IMAGE / VIDEO / AUDIO for the matching top-level types, TABLE for
        tabular types, TXT for plain text / JSON / XML, URL for HTML, and
        BINARY for everything else.
    """
    # Compare only "type/subtype": parameters would defeat the exact matches below.
    normalized = mime_type.split(";", 1)[0].strip().lower()
    if normalized.startswith("image/"):
        return OutputAssetType.IMAGE
    if normalized.startswith("video/"):
        return OutputAssetType.VIDEO
    if normalized.startswith("audio/"):
        return OutputAssetType.AUDIO
    if is_tabular_mime_type(normalized):
        return OutputAssetType.TABLE
    if normalized in {
        "text/plain",
        "application/json",
        "application/xml",
        "text/xml",
    }:
        return OutputAssetType.TXT
    if normalized == "text/html":
        return OutputAssetType.URL
    return OutputAssetType.BINARY
497
+
498
def _add_asset_if_new(
    self,
    assets: list[SingleAssetScanResults],
    asset: SingleAssetScanResults,
) -> bool:
    """Append ``asset`` to ``assets`` unless its hash was already seen in this run.

    Returns:
        True when the asset was appended, False when it was a duplicate.
    """
    is_new = asset.hash not in self._seen_asset_hashes
    if is_new:
        self._seen_asset_hashes.add(asset.hash)
        assets.append(asset)
    return is_new
508
+
509
async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
    """Download the raw bytes of an attachment and resolve its MIME type.

    ``asset_id`` may be an asset hash or a URL; a URL is converted to its
    hash first. The payload is truncated to ``attachment_max_bytes`` when
    that limit is positive.

    Returns:
        ``(bytes, mime_type)``, or ``None`` when the asset has no known
        download URL or the download fails (the failure is logged).
    """
    as_url = normalize_http_url(asset_id, base_url=self.base_url)
    lookup_key = self.generate_hash_id(as_url) if as_url else asset_id

    download_url = self._attachment_url_by_hash.get(lookup_key) or self._hash_to_url.get(
        lookup_key
    )
    if not download_url:
        return None

    try:
        payload, declared_mime = self.client.get_bytes(download_url)
    except Exception as exc:
        logger.warning("Failed to fetch Jira attachment bytes for %s: %s", download_url, exc)
        return None

    byte_limit = self.attachment_max_bytes
    if 0 < byte_limit < len(payload):
        payload = payload[:byte_limit]

    resolved_mime = resolve_mime_type(
        payload,
        declared_mime_type=declared_mime,
        file_name=self._attachment_file_name(lookup_key, download_url),
    )
    return payload, resolved_mime
536
+
537
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return ``(raw_content, text_content)`` for an asset.

    Cached content (issues, comments, previously parsed attachments) is
    served first, looked up by ``asset_id`` and then by the hash of its
    normalized URL. Otherwise the attachment is downloaded, truncated to
    ``attachment_max_bytes`` when that limit is positive, parsed, and cached
    if it yielded text.

    Returns:
        The content pair, or ``None`` when the asset is unknown, the download
        fails (logged), or parsing produced no text.
    """
    hit = self._asset_content_cache.get(asset_id)
    if hit:
        return hit

    as_url = normalize_http_url(asset_id, base_url=self.base_url)
    if as_url:
        url_hash = self.generate_hash_id(as_url)
        hit = self._asset_content_cache.get(url_hash)
        if hit:
            return hit
        asset_id = url_hash

    download_url = self._attachment_url_by_hash.get(asset_id) or self._hash_to_url.get(asset_id)
    if not download_url:
        return None

    try:
        payload, declared_mime = self.client.get_bytes(download_url)
    except Exception as exc:
        logger.warning("Failed to fetch Jira attachment %s: %s", download_url, exc)
        return None

    byte_limit = self.attachment_max_bytes
    if 0 < byte_limit < len(payload):
        payload = payload[:byte_limit]

    parsed = self.parse_asset_bytes(
        payload,
        declared_mime_type=declared_mime,
        file_name=self._attachment_file_name(asset_id, download_url),
    )
    if not parsed.text_content:
        return None

    self._asset_content_cache[asset_id] = (parsed.raw_content, parsed.text_content)
    return parsed.raw_content, parsed.text_content
575
+
576
def generate_hash_id(self, asset_id: str) -> str:
    """Hash a URL into an asset id and remember the hash-to-URL mapping.

    Raises:
        ValueError: If ``asset_id`` does not normalize to a URL.
    """
    canonical = normalize_http_url(asset_id, base_url=self.base_url)
    if canonical:
        digest = hash_url(canonical, base_url=self.base_url)
        # Remembered so that hashes can be turned back into URLs later.
        self._hash_to_url[digest] = canonical
        return digest
    raise ValueError(f"Invalid URL for hash: {asset_id}")
583
+
584
+ def resolve_link_for_detection(self, link: str) -> str | None:
585
+ mapped = self._hash_to_url.get(link)
586
+ if mapped:
587
+ return mapped
588
+ return normalize_http_url(link)
589
+
590
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Point the finding's location at the asset's external URL.

    ``text_content`` is accepted for interface compatibility and not used.
    """
    del text_content  # intentionally unused
    finding.location = Location(path=asset.external_url)
598
+
599
def abort(self) -> None:
    """Log the abort, delegate to the base-class abort, then close the HTTP client."""
    logger.info("Aborting Jira extraction...")
    super().abort()
    http_client = self.client
    http_client.close()
603
+
604
def cleanup(self) -> None:
    """Close the Atlassian client held by this source."""
    http_client = self.client
    http_client.close()
@@ -0,0 +1,3 @@
1
+ from .source import MongoDBSource
2
+
3
+ __all__ = ["MongoDBSource"]