classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
_VALID_SAMPLING_STRATEGIES = {"RANDOM", "LATEST", "ALL"}
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _as_dict(value: Any) -> dict[str, Any]:
|
|
10
|
+
if isinstance(value, dict):
|
|
11
|
+
return value
|
|
12
|
+
return {}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _as_positive_int(value: Any) -> int | None:
|
|
16
|
+
try:
|
|
17
|
+
if value is None:
|
|
18
|
+
return None
|
|
19
|
+
parsed = int(value)
|
|
20
|
+
except (TypeError, ValueError):
|
|
21
|
+
return None
|
|
22
|
+
return parsed if parsed > 0 else None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _normalize_sampling_strategy(value: Any) -> str | None:
    """Return the canonical sampling strategy name for ``value``.

    String inputs are stripped of surrounding whitespace and upper-cased;
    the result is returned only when it is a member of
    ``_VALID_SAMPLING_STRATEGIES``. Non-string or unrecognized inputs
    yield ``None``.
    """
    if isinstance(value, str):
        candidate = value.strip().upper()
        if candidate in _VALID_SAMPLING_STRATEGIES:
            return candidate
    return None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _as_bool(value: Any) -> bool | None:
|
|
33
|
+
if isinstance(value, bool):
|
|
34
|
+
return value
|
|
35
|
+
return None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _pick(*values: Any) -> Any:
|
|
39
|
+
for value in values:
|
|
40
|
+
if value is not None:
|
|
41
|
+
return value
|
|
42
|
+
return None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _normalize_object_storage_shape(
    normalized: dict[str, Any],
    source_type_value: str,
) -> None:
    """Move legacy flat object-storage keys into the nested recipe layout.

    Mutates ``normalized`` in place. Top-level keys are relocated into
    ``required``, ``optional.connection`` or ``optional.scope``. A value
    already present at the nested location wins (``setdefault``), and the
    flat key is always removed from the top level, even when its value is
    ``None``, so no stale legacy key is left behind.

    Args:
        normalized: Recipe dict to rewrite in place.
        source_type_value: Upper-cased source type; selects which
            provider-specific keys are relocated.
    """
    required = _as_dict(normalized.get("required"))
    optional = _as_dict(normalized.get("optional"))
    optional_connection = _as_dict(optional.get("connection"))
    optional_scope = _as_dict(optional.get("scope"))

    def relocate(keys: tuple[str, ...], target: dict[str, Any]) -> None:
        # Drop the flat key unconditionally; only non-None values are carried
        # over, and never overwrite a value already set at the nested path.
        for key in keys:
            value = normalized.pop(key, None)
            if value is not None:
                target.setdefault(key, value)

    # Keys shared by every object-storage provider.
    relocate(
        (
            "request_timeout_seconds",
            "max_keys_per_page",
            "max_object_bytes",
        ),
        optional_connection,
    )
    relocate(
        (
            "prefix",
            "include_extensions",
            "exclude_extensions",
            "include_empty_objects",
            "include_object_metadata",
            "include_content_preview",
        ),
        optional_scope,
    )

    # Provider-specific keys.
    if source_type_value == "S3_COMPATIBLE_STORAGE":
        relocate(("bucket",), required)
        relocate(("endpoint_url", "region_name", "verify_ssl"), optional_connection)
    elif source_type_value == "AZURE_BLOB_STORAGE":
        relocate(("account_url", "container"), required)
    elif source_type_value == "GOOGLE_CLOUD_STORAGE":
        relocate(("bucket",), required)
        relocate(("project_id", "gcp_credentials_file"), optional_connection)

    # "provider" is removed from the required section for every provider.
    required.pop("provider", None)
    normalized["required"] = required
    # Only write back sections that ended up non-empty.
    if optional_connection:
        optional["connection"] = optional_connection
    if optional_scope:
        optional["scope"] = optional_scope
    if optional:
        normalized["optional"] = optional
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def normalize_source_recipe(
    recipe: dict[str, Any],
    source_type: str | None = None,
) -> dict[str, Any]:
    """Return a normalized deep copy of a source recipe.

    The input ``recipe`` is never mutated. Normalization performs:

    * ``type`` is set to the upper-cased ``source_type`` argument, falling
      back to the recipe's own ``type`` (left untouched when both are empty).
    * ``sampling.strategy`` is resolved from, in order, ``sampling.strategy``,
      ``optional.sampling.strategy``, ``optional.sampling.mode``, and finally
      the default ``"RANDOM"``; invalid values are skipped.
    * Removed sampling fields (``limit``, ``max_columns``,
      ``fetch_all_until_first_success``) are dropped, and a few sampling
      options are copied up from ``optional.sampling`` when not already set.
    * ``optional.sampling`` is removed.
    * Source-specific legacy flat keys (WordPress ``url``, Slack
      ``workspace``, object-storage keys) are moved under their nested
      sections.

    Args:
        recipe: Raw recipe mapping.
        source_type: Optional explicit source type overriding ``recipe["type"]``.

    Returns:
        The normalized recipe as a new dict.
    """
    normalized = deepcopy(recipe)
    source_type_value = str(source_type or normalized.get("type") or "").upper()

    if source_type_value:
        normalized["type"] = source_type_value

    optional = _as_dict(normalized.get("optional"))
    optional_sampling = _as_dict(optional.get("sampling"))
    sampling = _as_dict(normalized.get("sampling"))

    sampling["strategy"] = _pick(
        _normalize_sampling_strategy(sampling.get("strategy")),
        _normalize_sampling_strategy(optional_sampling.get("strategy")),
        _normalize_sampling_strategy(optional_sampling.get("mode")),
        "RANDOM",
    )
    # Strip removed fields so legacy recipes with limit/max_columns don't fail validation
    sampling.pop("limit", None)
    sampling.pop("max_columns", None)

    # Values already present under top-level "sampling" take precedence over
    # the legacy "optional.sampling" location.
    for key in (
        "order_by_column",
        "fallback_to_random",
        "rows_per_page",
        "include_column_names",
    ):
        if key not in sampling and key in optional_sampling:
            sampling[key] = optional_sampling[key]

    sampling.pop("fetch_all_until_first_success", None)

    normalized["sampling"] = sampling

    optional.pop("sampling", None)

    if optional:
        normalized["optional"] = optional

    if source_type_value == "WORDPRESS":
        required = _as_dict(normalized.get("required"))
        if isinstance(normalized.get("url"), str):
            required.setdefault("url", normalized.pop("url"))
        normalized["required"] = required
        # dict.setdefault would keep an explicit ``"masked": None``; treat a
        # null value the same as a missing key so "masked" is always present
        # as a mapping. Other non-None values are left for validation to judge.
        if normalized.get("masked") is None:
            normalized["masked"] = {}

    if source_type_value == "SLACK":
        required = _as_dict(normalized.get("required"))
        if isinstance(normalized.get("workspace"), str):
            required.setdefault("workspace", normalized.pop("workspace"))
        normalized["required"] = required

    if source_type_value in {
        "S3_COMPATIBLE_STORAGE",
        "AZURE_BLOB_STORAGE",
        "GOOGLE_CLOUD_STORAGE",
    }:
        _normalize_object_storage_shape(normalized, source_type_value)

    return normalized
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# S3-Compatible Storage Source
|
|
2
|
+
|
|
3
|
+
This source uses the `S3_COMPATIBLE_STORAGE` schema and reads credentials from `config.masked`.
|
|
4
|
+
|
|
5
|
+
## Schema Field Mapping
|
|
6
|
+
|
|
7
|
+
Use these schema paths when configuring any S3-compatible provider:
|
|
8
|
+
|
|
9
|
+
- `config.required.bucket`: Bucket name (for example `testclassifyrebucket`)
|
|
10
|
+
- `config.masked.aws_access_key_id`: Access key ID (the ID value itself, not the key's display name)
|
|
11
|
+
- `config.masked.aws_secret_access_key`: Secret access key
|
|
12
|
+
- `config.masked.aws_session_token`: Optional temporary session token
|
|
13
|
+
- `config.optional.connection.endpoint_url`: Provider S3 endpoint URL
|
|
14
|
+
- `config.optional.connection.region_name`: Region used for SigV4 signing
|
|
15
|
+
- `config.optional.connection.verify_ssl`: TLS verification toggle (default `true`)
|
|
16
|
+
- `config.optional.connection.request_timeout_seconds`: Network timeout for list/download calls
|
|
17
|
+
- `config.optional.connection.max_keys_per_page`: Max objects per list page
|
|
18
|
+
- `config.optional.connection.max_object_bytes`: Max bytes downloaded per object for preview/extraction
|
|
19
|
+
- `config.optional.scope.prefix`: Optional prefix filter
|
|
20
|
+
- `config.optional.scope.include_extensions`: Optional include extension filter
|
|
21
|
+
- `config.optional.scope.exclude_extensions`: Optional exclude extension filter
|
|
22
|
+
- `config.optional.scope.include_empty_objects`: Include zero-byte objects
|
|
23
|
+
- `config.optional.scope.include_object_metadata`: Include object metadata
|
|
24
|
+
- `config.optional.scope.include_content_preview`: Download content for MIME/text preview
|
|
25
|
+
- `config.sampling.strategy`: `RANDOM`, `LATEST`, or `ALL`
|
|
26
|
+
- `config.sampling.rows_per_page`: Item limit per sample run (default 100, ignored when strategy is `ALL`)
|
|
27
|
+
|
|
28
|
+
## Backblaze B2 Example
|
|
29
|
+
|
|
30
|
+
```json
|
|
31
|
+
{
|
|
32
|
+
"type": "S3_COMPATIBLE_STORAGE",
|
|
33
|
+
"required": {
|
|
34
|
+
"bucket": "testclassifyrebucket"
|
|
35
|
+
},
|
|
36
|
+
"masked": {
|
|
37
|
+
"aws_access_key_id": "002b0e7121683000000000001",
|
|
38
|
+
"aws_secret_access_key": "K002ZQTPmV9xMTEGg/F3AGtDFzdZgnY"
|
|
39
|
+
},
|
|
40
|
+
"optional": {
|
|
41
|
+
"connection": {
|
|
42
|
+
"endpoint_url": "https://s3.us-west-002.backblazeb2.com",
|
|
43
|
+
"region_name": "us-west-002",
|
|
44
|
+
"verify_ssl": true,
|
|
45
|
+
"request_timeout_seconds": 30
|
|
46
|
+
},
|
|
47
|
+
"scope": {
|
|
48
|
+
"include_empty_objects": false,
|
|
49
|
+
"include_object_metadata": true,
|
|
50
|
+
"include_content_preview": true
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"sampling": {
|
|
54
|
+
"strategy": "LATEST"
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Common Misconfigurations
|
|
60
|
+
|
|
61
|
+
- `InvalidRequest: The Credential is malformed`:
|
|
62
|
+
- `aws_access_key_id` is wrong (often the secret key was pasted into this field by mistake).
|
|
63
|
+
- `InvalidAccessKeyId`:
|
|
64
|
+
- Access key ID does not exist for this B2 application key.
|
|
65
|
+
- `SUCCESS` but `Listed 0 object(s)`:
|
|
66
|
+
- `optional.scope.prefix` does not match actual key paths, or extension filters exclude files.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.parse import quote
|
|
7
|
+
|
|
8
|
+
from ...models.generated_input import S3CompatibleStorageInput
|
|
9
|
+
from ..dependencies import require_module
|
|
10
|
+
from ..object_storage.base import ObjectRef, ObjectStorageSourceBase
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class S3CompatibleStorageSource(ObjectStorageSourceBase):
    """Object-storage source that talks to an S3-compatible API through boto3.

    Configuration accessors such as ``_connection_option``, ``_masked_value``,
    ``_prefix`` and ``_max_object_bytes`` are not defined here; they are
    presumably inherited from ``ObjectStorageSourceBase`` (verify against the
    base class).
    """

    source_type = "s3_compatible_storage"
    provider_label = "S3_COMPATIBLE_STORAGE"
    input_model = S3CompatibleStorageInput

    def _required_bucket(self) -> str:
        """Return the configured bucket name, stripped of whitespace.

        Raises:
            ValueError: If the bucket is missing or blank.
        """
        raw_bucket = self.config.required.bucket
        # str(None) is the truthy string "None"; treat a missing value as empty
        # so it is rejected instead of being used as a bucket name.
        bucket = "" if raw_bucket is None else str(raw_bucket).strip()
        if not bucket:
            raise ValueError("required.bucket must be set")
        return bucket

    def _build_client(self) -> Any:
        """Create a boto3 S3 client from the connection and masked settings.

        Region, endpoint and credentials are passed only when configured.
        Timeout configuration is best-effort: if it cannot be set up, the
        client is still created without it.
        """
        boto3 = require_module(
            module_name="boto3",
            source_name="S3 Compatible Storage",
            uv_groups=["s3-compatible-storage"],
            detail="S3-compatible storage requires boto3.",
        )

        kwargs: dict[str, Any] = {}
        region_name = self._string_or_none(self._connection_option("region_name"))
        endpoint_url = self._string_or_none(self._connection_option("endpoint_url"))
        aws_access_key_id = self._masked_value("aws_access_key_id")
        aws_secret_access_key = self._masked_value("aws_secret_access_key")
        aws_session_token = self._masked_value("aws_session_token")

        if region_name:
            kwargs["region_name"] = region_name
        if endpoint_url:
            kwargs["endpoint_url"] = endpoint_url
        # Explicit credentials are used only when both halves of the key pair
        # are present; the session token is meaningful only alongside them.
        if aws_access_key_id and aws_secret_access_key:
            kwargs["aws_access_key_id"] = aws_access_key_id
            kwargs["aws_secret_access_key"] = aws_secret_access_key
            if aws_session_token:
                kwargs["aws_session_token"] = aws_session_token

        kwargs["verify"] = self._verify_ssl()

        try:
            botocore_config = require_module(
                module_name="botocore.config",
                source_name="S3 Compatible Storage",
                uv_groups=["s3-compatible-storage"],
                detail="S3-compatible storage uses botocore timeout configuration.",
            )
            timeout = int(self._request_timeout_seconds())
            kwargs["config"] = botocore_config.Config(
                connect_timeout=timeout,
                read_timeout=timeout,
            )
        except Exception:
            # Deliberately best-effort: fall back to the client defaults, but
            # keep the failure details available in debug logs.
            logger.debug(
                "Could not initialize botocore timeout configuration; using defaults",
                exc_info=True,
            )

        return boto3.client("s3", **kwargs)

    def _client(self) -> Any:
        """Return the cached S3 client, building it on first use."""
        if self._cached_client is None:
            self._cached_client = self._build_client()
        return self._cached_client

    def _list_objects(self) -> Iterator[ObjectRef]:
        """Yield an ``ObjectRef`` for each matching object in the bucket.

        Pages through ``list_objects_v2`` using continuation tokens. Skips
        keys that are empty or end with ``/``, zero-byte objects unless
        empty objects are included, and keys rejected by the extension
        filters.
        """
        client = self._client()
        bucket = self._required_bucket()
        prefix = self._prefix()
        max_keys = self._max_keys_per_page()

        continuation_token: str | None = None

        while True:
            params: dict[str, Any] = {
                "Bucket": bucket,
                "MaxKeys": max_keys,
            }
            if prefix:
                params["Prefix"] = prefix
            if continuation_token:
                params["ContinuationToken"] = continuation_token

            response = client.list_objects_v2(**params)
            for item in response.get("Contents", []) or []:
                key = str(item.get("Key") or "")
                # Keys ending in "/" are treated as folder placeholders.
                if not key or key.endswith("/"):
                    continue

                size = int(item.get("Size") or 0)
                if size == 0 and not self._include_empty_objects():
                    continue
                if not self._object_matches_extension_filters(key):
                    continue

                yield ObjectRef(
                    key=key,
                    size=size,
                    last_modified=self._parse_datetime(item.get("LastModified")),
                    # The ETag value is wrapped in double quotes; strip them.
                    etag=str(item.get("ETag")).strip('"') if item.get("ETag") else None,
                )

            if not response.get("IsTruncated"):
                break
            continuation_token = response.get("NextContinuationToken")
            # Guard against a truncated response without a token, which would
            # otherwise re-request the first page forever.
            if not continuation_token:
                break

    def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
        """Download an object's content, capped at the configured byte limit.

        Returns:
            A ``(data, content_type, truncated)`` tuple. ``truncated`` is
            ``True`` when only the leading ``max_bytes`` bytes were requested
            because the listed size exceeds the limit.
        """
        client = self._client()
        bucket = self._required_bucket()
        max_bytes = self._max_object_bytes()

        params: dict[str, Any] = {"Bucket": bucket, "Key": ref.key}
        truncated = False
        if ref.size > max_bytes:
            # HTTP byte ranges are inclusive, hence the "- 1".
            params["Range"] = f"bytes=0-{max_bytes - 1}"
            truncated = True

        response = client.get_object(**params)
        body = response["Body"]
        try:
            file_bytes = body.read()
        finally:
            try:
                body.close()
            except Exception:
                # Closing is best-effort; never mask the result or a read error.
                logger.debug("Failed to close S3 response body", exc_info=True)

        content_type = response.get("ContentType")
        return file_bytes, str(content_type) if content_type else None, truncated

    def _external_url(self, key: str) -> str:
        """Build a URL for ``key``.

        With a configured endpoint this is a percent-encoded path-style URL
        (``<endpoint>/<bucket>/<key>``); otherwise an ``s3://`` URI.
        """
        bucket = self._required_bucket()
        endpoint_url = self._string_or_none(self._connection_option("endpoint_url"))
        if endpoint_url:
            endpoint = endpoint_url.rstrip("/")
            encoded_bucket = quote(bucket, safe="")
            # Keep "/" unescaped so the key's path structure stays readable.
            encoded_key = quote(key, safe="/")
            return f"{endpoint}/{encoded_bucket}/{encoded_key}"
        return f"s3://{bucket}/{key}"
|