groundx 2.4.4__py3-none-any.whl → 2.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of groundx might be problematic. Click here for more details.
- groundx/core/client_wrapper.py +2 -2
- groundx/extract/__init__.py +38 -0
- groundx/extract/agents/__init__.py +7 -0
- groundx/extract/agents/agent.py +202 -0
- groundx/extract/classes/__init__.py +27 -0
- groundx/extract/classes/agent.py +22 -0
- groundx/extract/classes/api.py +15 -0
- groundx/extract/classes/document.py +311 -0
- groundx/extract/classes/field.py +88 -0
- groundx/extract/classes/groundx.py +123 -0
- groundx/extract/classes/post_process.py +33 -0
- groundx/extract/classes/prompt.py +36 -0
- groundx/extract/classes/settings.py +169 -0
- groundx/extract/classes/test_document.py +126 -0
- groundx/extract/classes/test_field.py +43 -0
- groundx/extract/classes/test_groundx.py +188 -0
- groundx/extract/classes/test_prompt.py +68 -0
- groundx/extract/classes/test_settings.py +515 -0
- groundx/extract/classes/test_utility.py +81 -0
- groundx/extract/classes/utility.py +193 -0
- groundx/extract/services/.DS_Store +0 -0
- groundx/extract/services/__init__.py +14 -0
- groundx/extract/services/csv.py +76 -0
- groundx/extract/services/logger.py +127 -0
- groundx/extract/services/logging_cfg.py +55 -0
- groundx/extract/services/ratelimit.py +104 -0
- groundx/extract/services/sheets_client.py +160 -0
- groundx/extract/services/status.py +197 -0
- groundx/extract/services/upload.py +73 -0
- groundx/extract/services/upload_minio.py +122 -0
- groundx/extract/services/upload_s3.py +84 -0
- groundx/extract/services/utility.py +52 -0
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/METADATA +1 -1
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/RECORD +36 -5
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/LICENSE +0 -0
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import json, os, typing
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from google.oauth2 import service_account
|
|
5
|
+
from googleapiclient.discovery import (
|
|
6
|
+
build, # pyright: ignore[reportUnknownVariableType]
|
|
7
|
+
)
|
|
8
|
+
import gspread
|
|
9
|
+
|
|
10
|
+
from ..classes.settings import ContainerSettings, GCP_CREDENTIALS
|
|
11
|
+
|
|
12
|
+
SPREADSHEET_MIME = "application/vnd.google-apps.spreadsheet"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SheetsClient:
    """Wrapper around the Google Drive v3 API and gspread for locating,
    creating, and preparing spreadsheets on a shared drive.

    Credentials are loaded via ``_load_credentials_from_env`` (the
    ``GCP_CREDENTIALS`` env var, with a ``./gcv.json`` fallback).
    """

    client: gspread.Client
    # drive: DriveResource
    drive: typing.Any
    settings: ContainerSettings

    def __init__(
        self,
        settings: ContainerSettings,
        scopes: typing.Optional[typing.List[str]] = None,
    ):
        """Build the Drive and gspread clients.

        Args:
            settings: container configuration (provides the template id).
            scopes: OAuth scopes; defaults to full Sheets + Drive access.

        Raises:
            ValueError: when no usable credentials can be loaded.
        """
        self.scopes: typing.List[str] = scopes or [
            "https://www.googleapis.com/auth/spreadsheets",
            "https://www.googleapis.com/auth/drive",
        ]
        self.settings = settings

        creds_dict = _load_credentials_from_env()
        if not creds_dict:
            raise ValueError(f"{GCP_CREDENTIALS} does not load valid credentials")

        # Use the resolved scope list for BOTH clients. The previous code
        # passed the raw ``scopes`` argument here, so the Drive client was
        # built with scopes=None whenever the caller relied on the defaults.
        creds = service_account.Credentials.from_service_account_info(
            creds_dict, scopes=self.scopes
        )
        self.drive = build("drive", "v3", credentials=creds)

        # ``self.scopes`` already equals ``scopes`` when one was supplied,
        # so no separate auth_scopes selection is needed.
        self.client = gspread.service_account_from_dict(creds_dict, scopes=self.scopes)

    def create_headers_if_missing(
        self, ws: gspread.Worksheet, headers: typing.List[str]
    ) -> None:
        """Insert ``headers`` as row 1 when the worksheet's first row is empty."""
        existing = ws.row_values(1)
        if not existing:
            ws.insert_row(headers, 1)

    def find_sheet_by_name(
        self,
        spreadsheet_name: str,
        drive_id: str,
    ) -> typing.Optional[str]:
        """Return the file id of a non-trashed spreadsheet named
        ``spreadsheet_name`` on the shared drive ``drive_id``, or None.
        """
        # Escape single quotes for the Drive query language.
        cln = spreadsheet_name.replace("'", "\\'")

        q = f"name = '{cln}' and mimeType = '{SPREADSHEET_MIME}' and trashed = false"

        resp = (
            self.drive.files()
            .list(
                q=q,
                corpora="drive",
                driveId=drive_id,
                includeItemsFromAllDrives=True,
                supportsAllDrives=True,
                fields="files(id, name)",
            )
            .execute()
        )
        files = resp.get("files", [])
        # If several files match, the first result wins.
        return files[0].get("id") if files else None

    def open_or_create_spreadsheet(
        self,
        spreadsheet_name: str,
        drive_id: str,
        sheet_1_title: typing.Optional[str] = None,
    ) -> gspread.Spreadsheet:
        """Open the named spreadsheet, creating it on the shared drive when
        absent (copying the configured template if one is set).

        Args:
            spreadsheet_name: spreadsheet file name on the drive.
            drive_id: shared drive id used both for lookup and as parent.
            sheet_1_title: optional new title for the first worksheet.

        Raises:
            Exception: when the Drive create/copy call returns no file id.
        """
        file_id = self.find_sheet_by_name(spreadsheet_name, drive_id)
        if file_id:
            return self.client.open_by_key(file_id)

        if self.settings.google_sheets_template_id:
            # Copy the template so the new spreadsheet inherits its layout.
            created = (
                self.drive.files()
                .copy(
                    fileId=self.settings.google_sheets_template_id,
                    body={"name": spreadsheet_name, "parents": [drive_id]},
                    supportsAllDrives=True,
                    fields="id,name,parents,driveId",
                )
                .execute()
            )
        else:
            created = (
                self.drive.files()
                .create(
                    body={
                        "name": spreadsheet_name,
                        "mimeType": SPREADSHEET_MIME,
                        "parents": [drive_id],
                    },
                    supportsAllDrives=True,
                    fields="id,name,parents,driveId",
                )
                .execute()
            )

        cid = created.get("id")
        if not cid:
            raise Exception(f"create spreadsheet failed\n{created}")

        sh = self.client.open_by_key(cid)

        if sheet_1_title:
            sh.sheet1.update_title(sheet_1_title)

        return sh

    def open_or_create_worksheet(
        self,
        sh: gspread.Spreadsheet,
        title: str,
        headers: typing.List[str],
        rows: int = 1000,
    ) -> gspread.Worksheet:
        """Return the worksheet ``title``, creating it (sized to ``headers``)
        and writing the header row when it does not exist.
        """
        cols = len(headers)
        try:
            ws = sh.worksheet(title)
            self.create_headers_if_missing(ws, headers)
        except gspread.WorksheetNotFound:
            ws = sh.add_worksheet(title=title, rows=rows, cols=cols)
            ws.append_row(headers)

        return ws
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _load_credentials_from_env() -> typing.Optional[typing.Dict[str, typing.Any]]:
    """Load GCP service-account credentials.

    Order of precedence: the ``GCP_CREDENTIALS`` environment variable
    (JSON), then a local ``./gcv.json`` file. Returns None when neither
    source is present.

    Raises:
        ValueError: when the loaded value is not valid JSON or not a dict.
    """
    raw = os.environ.get(GCP_CREDENTIALS)
    if not raw:
        if Path("./gcv.json").exists():
            with open("./gcv.json") as f:
                data = f.read()
            creds = json.loads(data)
            # Validate the fallback file the same way as the env var; the
            # original returned whatever JSON type the file contained.
            if not isinstance(creds, dict):
                raise ValueError(f"{GCP_CREDENTIALS} is not type dict [{type(creds)}]")
            return typing.cast(typing.Dict[str, typing.Any], creds)

        return None

    try:
        creds = json.loads(raw)
    except Exception as e:
        raise ValueError(f"{GCP_CREDENTIALS} is set but not valid JSON: {e}") from e

    # Raised OUTSIDE the try block: previously this ValueError was caught by
    # the broad except above and mis-reported as "not valid JSON".
    if not isinstance(creds, dict):
        raise ValueError(f"{GCP_CREDENTIALS} is not type dict [{type(creds)}]")

    return typing.cast(typing.Dict[str, typing.Any], creds)
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
from fastapi import Response
|
|
4
|
+
|
|
5
|
+
from ..classes.settings import ContainerSettings
|
|
6
|
+
from .logger import Logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Status:
    """Redis-backed worker availability tracker.

    Per worker id three keys are kept under the service namespace:
    ``{service}:{id}:status`` ("online"/"offline"),
    ``{service}:{id}:requests`` (available slots), and
    ``{service}:{id}:total`` (configured slot count). All keys are written
    with an expiry so stale workers age out automatically.
    """

    def __init__(
        self,
        cfg: ContainerSettings,
        logger: Logger,
    ) -> None:
        """Parse the broker URL from ``cfg.status_broker()`` and connect.

        Accepts forms like ``redis://host:port/0`` or ``rediss://host``
        (the latter enables SSL); defaults to port 6379.
        """
        import redis

        rl_port = 6379
        rl_host = cfg.status_broker()
        rl_ssl = False
        # Strip a trailing "/0" database suffix, then the scheme prefix.
        if rl_host.endswith("/0"):
            rl_host = rl_host[:-2]
        if rl_host.startswith("redis://"):
            rl_host = rl_host[8:]
        elif rl_host.startswith("rediss://"):
            rl_host = rl_host[9:]
            rl_ssl = True  # rediss:// implies TLS
        # Split off an explicit ":port" suffix when present and numeric.
        if ":" in rl_host:
            base, number = rl_host.rsplit(":", 1)
            if number.isdigit():
                rl_port = int(number)
                rl_host = base

        # decode_responses=True means GET returns str, not bytes.
        self.client = redis.Redis(
            host=rl_host, port=rl_port, decode_responses=True, ssl=rl_ssl
        )
        self.host = rl_host
        self.port = rl_port

        self.config = cfg
        self.logger = logger

        self.logger.info_msg(
            f"\n\t[{self.config.service}] [status.Status.__init__]\n\t\t{self.host}:{self.port}",
        )

    def get_worker_state(
        self, id: str, to: typing.Optional[int] = None
    ) -> typing.Tuple[typing.Optional[int], int]:
        """Return ``(available, total)`` for one worker.

        ``available`` is None when the worker is offline/unknown or its
        availability key is missing. ``to`` is accepted for signature
        symmetry with the setters but is not used here.
        """
        online = self.client.get(self.key_worker_status(id))
        if online is None or online == "offline":
            return None, self.config.workers

        key_worker_available = self.key_worker_available(id)

        current_available = self.client.get(key_worker_available)
        if current_available is None:
            return None, self.config.workers

        return int(current_available), self.config.workers  # type: ignore

    def get_service_state(self) -> typing.Tuple[int, int]:
        """Sum availability and totals across every worker of this service.

        Scans ``{service}:*:requests`` and ``{service}:*:total``; when no
        totals are recorded, falls back to the configured worker count.
        """
        available = 0

        keys: typing.Iterator[str] = self.client.scan_iter(  # type: ignore
            match=f"{self.config.service}:*:requests",
            count=1000,
        )
        for key in keys:
            value = self.client.get(key)
            if value is not None:
                available += int(value)  # type: ignore

        total = 0

        keys: typing.Iterator[str] = self.client.scan_iter(  # type: ignore
            match=f"{self.config.service}:*:total", count=1000
        )
        for key in keys:
            value = self.client.get(key)
            if value is not None:
                total += int(value)  # type: ignore

        if total == 0:
            total = self.config.workers

        return available, total

    def key_worker_available(self, id: str) -> str:
        """Redis key holding the worker's available request slots."""
        return f"{self.config.service}:{id}:requests"

    def key_worker_status(self, id: str) -> str:
        """Redis key holding the worker's online/offline flag."""
        return f"{self.config.service}:{id}:status"

    def key_worker_total(self, id: str) -> str:
        """Redis key holding the worker's configured slot total."""
        return f"{self.config.service}:{id}:total"

    def refresh_worker(self, id: str, to: typing.Optional[int] = None) -> None:
        """Refresh all three keys (status, total, available) for a worker."""
        self.refresh_worker_online(id, to)
        self.refresh_worker_total(id, to)
        self.refresh_worker_available(id, to)

    def refresh_worker_available(
        self, id: str, to: typing.Optional[int] = None
    ) -> None:
        """Initialize the availability key if missing; otherwise extend its TTL.

        NOTE(review): when the key exists and ``to`` is None no expiry is
        refreshed at all — confirm whether that is intentional.
        """
        key_worker_available = self.key_worker_available(id)
        current_available = self.client.get(key_worker_available)
        if current_available is None:
            self.set_value(key_worker_available, self.config.workers, to)
        else:
            if to is not None:
                if to > 0:
                    self.client.expire(key_worker_available, to)
                else:
                    self.client.expire(key_worker_available, self.config.cache_to)

    def refresh_worker_online(self, id: str, to: typing.Optional[int] = None) -> None:
        """Re-mark the worker online (resets the status key's TTL)."""
        self.set_worker_online(id, to)

    def refresh_worker_total(self, id: str, to: typing.Optional[int] = None) -> None:
        """Rewrite the worker's slot total from config (resets its TTL)."""
        self.set_value(self.key_worker_total(id), self.config.workers, to)

    def set_headers(
        self,
        response: Response,
        id: str,
        available: typing.Optional[int],
        total: typing.Optional[int],
    ) -> typing.Any:
        """Attach rate-limit headers to a FastAPI response and return it.

        None counts are reported as 0; the remaining count is clamped at 0.
        """
        if available is None:
            available = 0
        if total is None:
            total = 0

        response.headers.update(
            {
                "X-RateLimit-Limit-Requests": str(total),
                "X-RateLimit-Remaining-Requests": str(max(0, available)),
                "X-ID": id,
            }
        )

        return response

    def set_value(
        self, key: str, value: typing.Union[str, int], to: typing.Optional[int] = None
    ) -> None:
        """SET ``key`` with an expiry: ``to`` when positive, else the
        configured ``cache_to`` default."""
        if to is not None:
            if to > 0:
                self.client.set(key, value, ex=to)
            else:
                self.client.set(key, value, ex=self.config.cache_to)
        else:
            self.client.set(key, value, ex=self.config.cache_to)

    def set_worker_available(self, id: str, to: typing.Optional[int] = None) -> None:
        """Release one slot: increment availability, capped at the configured
        worker count, refreshing status and total along the way."""
        self.refresh_worker_online(id, to)

        self.refresh_worker_total(id, to)

        key_worker_available = self.key_worker_available(id)
        current_available = self.client.get(key_worker_available)
        if current_available is None:
            # No record yet: start at full availability.
            current_available = self.config.workers
            self.set_value(key_worker_available, current_available, to)
        else:
            self.set_value(
                key_worker_available,
                min(self.config.workers, int(current_available) + 1),  # type: ignore
                to,
            )

    def set_worker_offline(self, id: str, to: typing.Optional[int] = None) -> None:
        """Mark the worker offline and consume one availability slot."""
        if to is None:
            to = self.config.cache_to
        self.logger.info_msg(f"\n\n\t\t[{self.config.service}] offline [{id}]\n")
        self.set_value(self.key_worker_status(id), "offline", to)
        self.set_worker_unavailable(id, to)

    def set_worker_online(self, id: str, to: typing.Optional[int] = None) -> None:
        """Mark the worker online (with expiry via ``set_value``)."""
        self.set_value(self.key_worker_status(id), "online", to)

    def set_worker_unavailable(self, id: str, to: typing.Optional[int] = None) -> None:
        """Consume one slot: decrement availability, floored at 0, refreshing
        status and total along the way."""
        self.refresh_worker_online(id, to)

        self.set_value(self.key_worker_total(id), self.config.workers, to)

        key_worker_available = self.key_worker_available(id)
        current_available = self.client.get(key_worker_available)
        if current_available is None:
            # No record yet: assume full availability minus this request.
            current_available = self.config.workers - 1
            self.set_value(key_worker_available, current_available, to)
        else:
            self.set_value(
                key_worker_available,
                max(0, int(current_available) - 1),  # type: ignore
                to,
            )
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
from ..classes.settings import ContainerSettings
|
|
4
|
+
from .logger import Logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@typing.runtime_checkable
class UploadClient(typing.Protocol):
    """Structural interface shared by the MinIO and S3 storage backends."""

    def get_object(self, url: str) -> typing.Optional[bytes]:
        """Fetch the object at an ``s3://bucket/key`` style URL; None on no client."""
        ...

    def put_object(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store ``data`` at ``bucket``/``key`` with the given content type."""
        ...

    def put_json_stream(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store ``data`` (typically serialized JSON) at ``bucket``/``key``."""
        ...
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Upload:
    """Facade over the configured object-storage backend.

    Selects a MinIO or S3 client based on ``settings.upload.type`` and
    delegates get/put calls to it.
    """

    def __init__(
        self,
        settings: ContainerSettings,
        logger: Logger,
    ) -> None:
        """Instantiate the backend client.

        Raises:
            Exception: when ``settings.upload.type`` is neither "minio" nor "s3".
        """
        self.client: UploadClient
        self.settings = settings
        self.logger = logger

        # Backend imports are deferred so only the configured SDK is required.
        if self.settings.upload.type == "minio":
            from .upload_minio import MinIOClient

            self.client = MinIOClient(self.settings, self.logger)
        elif self.settings.upload.type == "s3":
            from .upload_s3 import S3Client

            self.client = S3Client(self.settings, self.logger)
        else:
            raise Exception(f"unsupported upload.type [{self.settings.upload.type}]")

    def get_file(self, url: str) -> bytes:
        """Stub: always returns empty bytes.

        NOTE(review): looks unimplemented — confirm whether callers expect
        real content here.
        """
        return bytes()

    def get_object(self, url: str) -> typing.Optional[bytes]:
        """Fetch an object via the backend client.

        Fix: the previous implementation discarded the client's result and
        unconditionally returned None.
        """
        return self.client.get_object(url)

    def put_object(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store ``data`` at ``bucket``/``key`` via the backend client."""
        self.client.put_object(bucket, key, data, content_type)

    def put_json_stream(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store serialized JSON bytes at ``bucket``/``key`` via the backend."""
        self.client.put_json_stream(bucket, key, data, content_type)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
from ..classes.settings import ContainerSettings
|
|
4
|
+
from .logger import Logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class MinIOClient:
    """Object-storage backend backed by a MinIO server.

    The client is only constructed when ``settings.upload.type == "minio"``;
    otherwise every method is a no-op returning None.
    """

    def __init__(
        self,
        settings: ContainerSettings,
        logger: Logger,
    ) -> None:
        """Connect to MinIO and ensure the configured bucket exists with a
        public-read policy. Bucket-creation failures are logged, not raised.
        """
        self.settings = settings
        self.client = None
        self.logger = logger
        if self.settings.upload.type == "minio":
            import json
            from minio import Minio

            self.client = Minio(
                self.settings.upload.base_domain,
                access_key=self.settings.upload.get_key(),
                secret_key=self.settings.upload.get_secret(),
                region=self.settings.upload.get_region(),
                session_token=self.settings.upload.get_token(),
                secure=self.settings.upload.ssl,
            )

            if not self.client.bucket_exists(self.settings.upload.bucket):
                try:
                    self.client.make_bucket(self.settings.upload.bucket)
                    self.logger.info_msg(
                        f"Bucket '{self.settings.upload.bucket}' created."
                    )

                    # Anonymous read-only policy for the whole bucket.
                    self.client.set_bucket_policy(
                        self.settings.upload.bucket,
                        json.dumps(
                            {
                                "Version": "2012-10-17",
                                "Statement": [
                                    {
                                        "Effect": "Allow",
                                        "Principal": {"AWS": ["*"]},
                                        "Action": ["s3:GetObject"],
                                        "Resource": [
                                            f"arn:aws:s3:::{self.settings.upload.bucket}/*"
                                        ],
                                    }
                                ],
                            }
                        ),
                    )
                except Exception as e:
                    # Best-effort: another worker may have created it first.
                    self.logger.warning_msg(str(e))
                    self.logger.warning_msg(
                        f"error creating bucket [{self.settings.upload.bucket}]"
                    )

    def get_object(self, url: str) -> typing.Optional[bytes]:
        """Fetch the object addressed by an ``s3://bucket/key`` URL.

        Returns None when no client is configured; re-raises S3 errors
        after logging.
        """
        if not self.client:
            return None

        from minio.error import S3Error

        try:
            minio_uri_parts = url.replace("s3://", "").split("/")
            bucket_name = minio_uri_parts[0]
            object_name = "/".join(minio_uri_parts[1:])

            response = self.client.get_object(bucket_name, object_name)
            # Fix: close and release the HTTP response; the MinIO SDK
            # requires this to return the connection to the pool.
            try:
                return response.read()
            finally:
                response.close()
                response.release_conn()
        except S3Error as e:
            self.logger.error_msg(f"Failed to get object from {url}: {str(e)}")
            raise

    def put_object(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store ``data`` at ``bucket``/``key``; str input is UTF-8 encoded.

        No-op when no client is configured; re-raises S3 errors after logging.
        """
        if not self.client:
            return

        import io

        from minio.error import S3Error

        try:
            # Tolerate callers passing str despite the bytes annotation.
            if isinstance(data, str):
                data = data.encode("utf-8")

            self.client.put_object(
                bucket_name=bucket,
                object_name=key,
                data=io.BytesIO(data),
                length=len(data),
                content_type=content_type,
            )
        except S3Error as e:
            self.logger.error_msg(f"Failed to put object in {bucket}/{key}: {str(e)}")
            raise

    def put_json_stream(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store serialized JSON bytes; delegates to :meth:`put_object`."""
        if not self.client:
            return

        self.put_object(
            bucket,
            key,
            data,
            content_type,
        )
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
from ..classes.settings import ContainerSettings
|
|
4
|
+
from .logger import Logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class S3Client:
    """Object-storage backend backed by AWS S3.

    The boto3 client is only built when ``settings.upload.type == "s3"``;
    otherwise every method is a no-op returning None.
    """

    def __init__(self, settings: ContainerSettings, logger: Logger) -> None:
        self.settings = settings
        self.client = None
        self.logger = logger

        if self.settings.upload.type != "s3":
            return

        import boto3, certifi
        from botocore.config import Config

        upload_cfg = self.settings.upload
        self.client = boto3.client(  # pyright: ignore[reportUnknownMemberType]
            "s3",
            aws_access_key_id=upload_cfg.get_key(),
            aws_secret_access_key=upload_cfg.get_secret(),
            aws_session_token=upload_cfg.get_token(),
            config=Config(max_pool_connections=50),
            region_name=upload_cfg.get_region(),
            verify=certifi.where(),
        )

    def get_object(self, url: str) -> typing.Optional[bytes]:
        """Fetch the object addressed by an ``s3://bucket/key`` URL.

        Returns None when no client is configured; logs and re-raises on
        any failure.
        """
        if not self.client:
            return None

        try:
            bucket, _, key = url.replace("s3://", "").partition("/")
            body = self.client.get_object(Bucket=bucket, Key=key)["Body"]
            return body.read()
        except Exception as e:
            self.logger.error_msg(f"[{url}] exception: {e}")
            raise

    def put_object(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store ``data`` at ``bucket``/``key`` with the given content type."""
        if not self.client:
            return

        self.client.put_object(
            Bucket=bucket,
            Key=key,
            Body=data,
            ContentType=content_type,
        )

    def put_json_stream(
        self,
        bucket: str,
        key: str,
        data: bytes,
        content_type: str = "application/octet-stream",
    ) -> None:
        """Store serialized JSON bytes; str input is UTF-8 encoded first."""
        if not self.client:
            return

        payload = data.encode("utf-8") if isinstance(data, str) else data

        self.put_object(
            bucket,
            key,
            payload,
            content_type,
        )
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_config_path() -> typing.Optional[str]:
    """Return the value following a ``-c`` flag in ``sys.argv``, or None
    when the flag is absent or has no argument after it."""
    import sys

    args = sys.argv
    try:
        flag_pos = args.index("-c")
    except ValueError:
        return None

    value_pos = flag_pos + 1
    return args[value_pos] if value_pos < len(args) else None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_gunicorn_threads() -> int:
    """Load the gunicorn config named by ``-c`` and return its ``threads``
    setting.

    Returns 1 when no config path is given or anything goes wrong, and 0
    when the path cannot be turned into a loadable module spec.
    """
    import importlib.util

    try:
        conf_path = get_config_path()
        if conf_path is None:
            return 1

        spec = importlib.util.spec_from_file_location("gunicorn_conf", conf_path)
        if spec is None or spec.loader is None:
            return 0

        gunicorn_conf = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(gunicorn_conf)
        return gunicorn_conf.threads
    except Exception:
        # Any import/attribute failure falls back to a single thread.
        return 1
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_thread_id(
    thread_ids: typing.Dict[str, str],
) -> typing.Tuple[str, typing.Dict[str, str]]:
    """Return a stable random hex id for the current thread.

    A new 8-char id is minted on first sight of a thread name; the mapping
    is mutated in place and returned alongside the id.
    """
    import secrets
    import threading

    name = threading.current_thread().name
    ident = thread_ids.setdefault(name, secrets.token_hex(4))
    return ident, thread_ids
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_worker_id() -> str:
    """Identify this worker: the HOSTNAME env var when set and non-empty,
    otherwise the current process pid as a string."""
    import os
    from multiprocessing import current_process

    host = os.environ.get("HOSTNAME")
    if host:
        return host

    return str(current_process().pid)
|