castor-extractor 0.16.15__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +4 -0
- castor_extractor/commands/upload.py +4 -4
- castor_extractor/uploader/constant.py +8 -1
- castor_extractor/uploader/upload.py +63 -46
- castor_extractor/uploader/upload_test.py +4 -3
- castor_extractor/uploader/utils.py +10 -0
- castor_extractor/warehouse/abstract/asset.py +1 -0
- {castor_extractor-0.16.15.dist-info → castor_extractor-0.17.0.dist-info}/METADATA +1 -1
- {castor_extractor-0.16.15.dist-info → castor_extractor-0.17.0.dist-info}/RECORD +12 -12
- {castor_extractor-0.16.15.dist-info → castor_extractor-0.17.0.dist-info}/LICENCE +0 -0
- {castor_extractor-0.16.15.dist-info → castor_extractor-0.17.0.dist-info}/WHEEL +0 -0
- {castor_extractor-0.16.15.dist-info → castor_extractor-0.17.0.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
|
@@ -13,10 +13,10 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
|
|
13
13
|
def _args():
|
|
14
14
|
parser = argparse.ArgumentParser()
|
|
15
15
|
parser.add_argument(
|
|
16
|
-
"-
|
|
17
|
-
"--
|
|
16
|
+
"-k",
|
|
17
|
+
"--token",
|
|
18
18
|
required=True,
|
|
19
|
-
help="""
|
|
19
|
+
help="""API token provided by Castor""",
|
|
20
20
|
)
|
|
21
21
|
parser.add_argument(
|
|
22
22
|
"-s",
|
|
@@ -44,7 +44,7 @@ def _args():
|
|
|
44
44
|
)
|
|
45
45
|
parsed = parser.parse_args()
|
|
46
46
|
return {
|
|
47
|
-
"
|
|
47
|
+
"token": parsed.token,
|
|
48
48
|
"source_id": parsed.source_id,
|
|
49
49
|
"file_path": parsed.file_path,
|
|
50
50
|
"directory_path": parsed.directory_path,
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
from ..utils import RetryStrategy
|
|
4
|
+
|
|
5
|
+
# url of the gcs proxy
|
|
6
|
+
INGEST_URL = "https://ingest.castordoc.com"
|
|
7
|
+
|
|
8
|
+
RETRY_BASE_MS = 10_000
|
|
9
|
+
RETRY_JITTER_MS = 1_000
|
|
10
|
+
RETRY_STRATEGY = RetryStrategy.LINEAR
|
|
4
11
|
|
|
5
12
|
|
|
6
13
|
class FileType(Enum):
|
|
@@ -1,83 +1,100 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
import json
|
|
3
2
|
import logging
|
|
4
3
|
import ntpath
|
|
5
4
|
from datetime import datetime
|
|
6
|
-
from typing import Iterable, Optional,
|
|
5
|
+
from typing import Dict, Iterable, Optional, Tuple
|
|
7
6
|
from uuid import UUID
|
|
8
7
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
from .
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
from ..utils.retry import retry
|
|
11
|
+
from .constant import (
|
|
12
|
+
INGEST_URL,
|
|
13
|
+
PATH_TEMPLATES,
|
|
14
|
+
RETRY_BASE_MS,
|
|
15
|
+
RETRY_JITTER_MS,
|
|
16
|
+
RETRY_STRATEGY,
|
|
17
|
+
FileType,
|
|
18
|
+
)
|
|
12
19
|
from .env import get_blob_env
|
|
13
|
-
from .utils import
|
|
20
|
+
from .utils import iter_files
|
|
14
21
|
|
|
15
22
|
logger = logging.getLogger(__name__)
|
|
16
23
|
|
|
24
|
+
_EXCEPTIONS = (
|
|
25
|
+
requests.exceptions.Timeout,
|
|
26
|
+
requests.exceptions.ConnectTimeout,
|
|
27
|
+
)
|
|
17
28
|
|
|
18
|
-
def _client(credentials: Union[str, dict]) -> storage.Client:
|
|
19
|
-
"""supports dict, string or path to the JSON file"""
|
|
20
|
-
if isinstance(credentials, dict):
|
|
21
|
-
return storage.Client.from_service_account_info(credentials)
|
|
22
|
-
if file_exist(credentials):
|
|
23
|
-
return storage.Client.from_service_account_json(credentials)
|
|
24
|
-
if isinstance(credentials, str):
|
|
25
|
-
credentials = json.loads(credentials)
|
|
26
|
-
return storage.Client.from_service_account_info(credentials)
|
|
27
|
-
raise ValueError("needs path or dict for credentials")
|
|
28
29
|
|
|
30
|
+
def _path_and_url(
|
|
31
|
+
source_id: UUID,
|
|
32
|
+
file_type: FileType,
|
|
33
|
+
file_path: str,
|
|
34
|
+
) -> Tuple[str, str]:
|
|
29
35
|
|
|
30
|
-
def _path(source_id: UUID, file_type: FileType, file_path: str) -> str:
|
|
31
36
|
now = datetime.utcnow()
|
|
32
37
|
timestamp = int(now.timestamp())
|
|
33
38
|
filename = ntpath.basename(file_path)
|
|
34
|
-
|
|
35
39
|
path_template = PATH_TEMPLATES[file_type]
|
|
36
|
-
|
|
40
|
+
path = path_template.format(
|
|
37
41
|
timestamp=timestamp,
|
|
38
42
|
source_id=source_id,
|
|
39
43
|
filename=filename,
|
|
40
44
|
)
|
|
41
45
|
|
|
46
|
+
url = f"{INGEST_URL}/{path}"
|
|
42
47
|
|
|
43
|
-
|
|
44
|
-
credentials: Union[str, dict],
|
|
45
|
-
source_id: UUID,
|
|
46
|
-
file_path: str,
|
|
47
|
-
file_type: FileType,
|
|
48
|
-
) -> storage.Blob:
|
|
49
|
-
"""get the target blob to upload to"""
|
|
50
|
-
client = _client(credentials)
|
|
51
|
-
path = _path(source_id, file_type, file_path)
|
|
48
|
+
return path, url
|
|
52
49
|
|
|
53
|
-
|
|
54
|
-
|
|
50
|
+
|
|
51
|
+
def _headers(token: str) -> Dict:
|
|
52
|
+
return {
|
|
53
|
+
"Authorization": f"Token {token}",
|
|
54
|
+
"Accept": "text/csv, application/json",
|
|
55
|
+
}
|
|
55
56
|
|
|
56
57
|
|
|
57
58
|
def _upload(
|
|
58
|
-
|
|
59
|
+
token: str,
|
|
59
60
|
source_id: UUID,
|
|
60
61
|
file_path: str,
|
|
61
62
|
file_type: FileType,
|
|
62
63
|
) -> None:
|
|
63
64
|
"""
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
file_path: path to the local file to upload
|
|
65
|
+
Upload the given file to Google Cloud Storage (GCS)
|
|
66
|
+
- Don't call GCS API directly
|
|
67
|
+
- Call the ingestion proxy which handles authorisation and uploading
|
|
68
68
|
"""
|
|
69
|
-
|
|
69
|
+
path, url = _path_and_url(source_id, file_type, file_path)
|
|
70
|
+
headers = _headers(token)
|
|
71
|
+
timeout, max_retries = get_blob_env()
|
|
70
72
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
73
|
+
with open(file_path, "rb") as file_content:
|
|
74
|
+
|
|
75
|
+
@retry(
|
|
76
|
+
exceptions=_EXCEPTIONS,
|
|
77
|
+
max_retries=max_retries,
|
|
78
|
+
base_ms=RETRY_BASE_MS,
|
|
79
|
+
jitter_ms=RETRY_JITTER_MS,
|
|
80
|
+
strategy=RETRY_STRATEGY,
|
|
81
|
+
)
|
|
82
|
+
def _request_post():
|
|
83
|
+
response = requests.post(
|
|
84
|
+
url=url,
|
|
85
|
+
headers=headers,
|
|
86
|
+
data=file_content,
|
|
87
|
+
timeout=timeout,
|
|
88
|
+
)
|
|
89
|
+
response.raise_for_status()
|
|
90
|
+
|
|
91
|
+
_request_post()
|
|
92
|
+
|
|
93
|
+
logger.info(f"Uploaded {file_path} as {file_type.value} to {path}")
|
|
77
94
|
|
|
78
95
|
|
|
79
96
|
def upload_manifest(
|
|
80
|
-
|
|
97
|
+
token: str,
|
|
81
98
|
source_id: UUID,
|
|
82
99
|
file_path: str,
|
|
83
100
|
) -> None:
|
|
@@ -86,11 +103,11 @@ def upload_manifest(
|
|
|
86
103
|
source_id: id for the source
|
|
87
104
|
file_path: path to the local manifest to upload
|
|
88
105
|
"""
|
|
89
|
-
_upload(
|
|
106
|
+
_upload(token, source_id, file_path, FileType.DBT)
|
|
90
107
|
|
|
91
108
|
|
|
92
109
|
def upload(
|
|
93
|
-
|
|
110
|
+
token: str,
|
|
94
111
|
source_id: UUID,
|
|
95
112
|
file_type: FileType,
|
|
96
113
|
file_path: Optional[str] = None,
|
|
@@ -113,4 +130,4 @@ def upload(
|
|
|
113
130
|
raise ValueError(message)
|
|
114
131
|
|
|
115
132
|
for file_ in files:
|
|
116
|
-
_upload(
|
|
133
|
+
_upload(token, source_id, file_, file_type)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from uuid import UUID
|
|
2
2
|
|
|
3
|
-
from .constant import FileType
|
|
4
|
-
from .upload import
|
|
3
|
+
from .constant import INGEST_URL, FileType
|
|
4
|
+
from .upload import _path_and_url
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def test__path():
|
|
@@ -9,5 +9,6 @@ def test__path():
|
|
|
9
9
|
file_type = FileType.VIZ
|
|
10
10
|
file_path = "filename"
|
|
11
11
|
|
|
12
|
-
path =
|
|
12
|
+
path, url = _path_and_url(source_id, file_type, file_path)
|
|
13
13
|
assert path == f"visualization-{source_id}/{file_path}"
|
|
14
|
+
assert url == f"{INGEST_URL}/{path}"
|
|
@@ -1,13 +1,23 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import os
|
|
2
3
|
from typing import Iterator
|
|
3
4
|
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
_ALLOWED_EXTENSION = (".json", ".csv")
|
|
8
|
+
|
|
4
9
|
|
|
5
10
|
def iter_files(repository_path: str) -> Iterator[str]:
|
|
6
11
|
"""
|
|
7
12
|
Given a repository path yield all files in that repository
|
|
13
|
+
Removes file whose extension is not allowed
|
|
8
14
|
"""
|
|
9
15
|
|
|
10
16
|
for file in os.listdir(repository_path):
|
|
17
|
+
_, ext = os.path.splitext(file)
|
|
18
|
+
if ext not in _ALLOWED_EXTENSION:
|
|
19
|
+
logger.info(f"Forbidden file extension : skipping {file}")
|
|
20
|
+
continue
|
|
11
21
|
file_path = os.path.join(repository_path, file)
|
|
12
22
|
|
|
13
23
|
if os.path.isfile(file_path):
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
CHANGELOG.md,sha256=
|
|
1
|
+
CHANGELOG.md,sha256=EVZ9vhIVN7HLn5PYkRyBWyT3hk72Nt3i1SghwSipfR4,10957
|
|
2
2
|
Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
|
|
3
3
|
LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
|
|
4
4
|
README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578
|
|
@@ -23,7 +23,7 @@ castor_extractor/commands/extract_snowflake.py,sha256=vYiruxRoo--GeMemOGsSE1w9kc
|
|
|
23
23
|
castor_extractor/commands/extract_sqlserver.py,sha256=lwhbcNChaXHZgMgSOch3faVr7WJw-sDU6GHl3lzBt_0,1141
|
|
24
24
|
castor_extractor/commands/extract_tableau.py,sha256=u-6UCd-kfXwyhNWYxZusqtgTTYkf4gAJS1vRIYWsAVU,1415
|
|
25
25
|
castor_extractor/commands/file_check.py,sha256=PkXFK-kPoG8PpvBP-PCvVdreqwYw6Y1fTh2QzDxACsM,2684
|
|
26
|
-
castor_extractor/commands/upload.py,sha256=
|
|
26
|
+
castor_extractor/commands/upload.py,sha256=tAyHfIDOdUpD0yMJe2f64nXpaVnIbgYXi4bYx2nNvNU,1922
|
|
27
27
|
castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
|
|
28
28
|
castor_extractor/file_checker/column.py,sha256=fMchy5v-Sd-0xuYS0V9mob7wnljslzWLhQGqrKGybdk,3097
|
|
29
29
|
castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
|
|
@@ -38,12 +38,12 @@ castor_extractor/file_checker/templates/generic_warehouse.py,sha256=zvnWnYB8FNvh
|
|
|
38
38
|
castor_extractor/logger.py,sha256=ovf1mBEKwbJBskBXoqHbcAomBrp58mUwSrCWtEMlYPM,1197
|
|
39
39
|
castor_extractor/types.py,sha256=-QgiOaq--nXUsYLy_oESDrYbRMxs353-YiQnG1blJvU,1303
|
|
40
40
|
castor_extractor/uploader/__init__.py,sha256=SSRtwjg-dNoxME-RJy9G1flASiUKAC5bH1htq3CURQg,75
|
|
41
|
-
castor_extractor/uploader/constant.py,sha256=
|
|
41
|
+
castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
|
|
42
42
|
castor_extractor/uploader/env.py,sha256=5HSniVSOYVg4u38O4k8TB_qaJq9s8yJ1hjedkq_gdVg,878
|
|
43
43
|
castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
|
|
44
|
-
castor_extractor/uploader/upload.py,sha256=
|
|
45
|
-
castor_extractor/uploader/upload_test.py,sha256=
|
|
46
|
-
castor_extractor/uploader/utils.py,sha256=
|
|
44
|
+
castor_extractor/uploader/upload.py,sha256=bTWD1_-hmJ6q1qcEosjZ96wsBtWDnWoCt692NYX_Nko,3228
|
|
45
|
+
castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
|
|
46
|
+
castor_extractor/uploader/utils.py,sha256=Tx_i875L2vJ8btOLV3-L0UMEFiyhH8E5n0XXRyLjO0Y,793
|
|
47
47
|
castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
|
|
48
48
|
castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
|
|
49
49
|
castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
|
|
@@ -253,7 +253,7 @@ castor_extractor/visualization/tableau_revamp/constants.py,sha256=PcdudAogQhi3e-
|
|
|
253
253
|
castor_extractor/visualization/tableau_revamp/extract.py,sha256=2SLUxp5okM4AcEJJ61ZgcC2ikfZZl9MH17CEXMXmgl0,1450
|
|
254
254
|
castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
255
255
|
castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
|
|
256
|
-
castor_extractor/warehouse/abstract/asset.py,sha256=
|
|
256
|
+
castor_extractor/warehouse/abstract/asset.py,sha256=Qs7T2Iw7KHgWVT2aAoBfCQ8tB143cUZY-DRUSkpgvGU,2689
|
|
257
257
|
castor_extractor/warehouse/abstract/asset_test.py,sha256=_kd4ybNlWSAdSdEgJKC-jhJTa1nMRa9i8RO3YbqKLM4,758
|
|
258
258
|
castor_extractor/warehouse/abstract/extract.py,sha256=fVBhdE-yMI_g6RBYZcr7q-ZVW7jK7WVkO_GO_KfkRqg,2908
|
|
259
259
|
castor_extractor/warehouse/abstract/query.py,sha256=GAgeISCmAdrkTKzFGO79hQDf6SA6EFrrlW43w-LiXKo,2632
|
|
@@ -370,8 +370,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
|
|
|
370
370
|
castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
|
|
371
371
|
castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
|
|
372
372
|
castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
|
|
373
|
-
castor_extractor-0.
|
|
374
|
-
castor_extractor-0.
|
|
375
|
-
castor_extractor-0.
|
|
376
|
-
castor_extractor-0.
|
|
377
|
-
castor_extractor-0.
|
|
373
|
+
castor_extractor-0.17.0.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
|
|
374
|
+
castor_extractor-0.17.0.dist-info/METADATA,sha256=mPiUyxCqXFifcPbhcOPFsnkPAV4OcWXoYzGeUKlbkoo,6582
|
|
375
|
+
castor_extractor-0.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
376
|
+
castor_extractor-0.17.0.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
|
|
377
|
+
castor_extractor-0.17.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|