castor-extractor 0.16.15__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. More details are available on the original release page.

CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.17.0 - 2024-06-10
4
+
5
+ * Uploader: redirect to the proxy, replace credentials with token
6
+
3
7
  ## 0.16.15 - 2024-06-07
4
8
 
5
9
  * Tableau: extract database_name for CustomSQLTables
@@ -13,10 +13,10 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
13
13
  def _args():
14
14
  parser = argparse.ArgumentParser()
15
15
  parser.add_argument(
16
- "-c",
17
- "--credentials",
16
+ "-k",
17
+ "--token",
18
18
  required=True,
19
- help="""Path to credentials or credentials as string""",
19
+ help="""API token provided by Castor""",
20
20
  )
21
21
  parser.add_argument(
22
22
  "-s",
@@ -44,7 +44,7 @@ def _args():
44
44
  )
45
45
  parsed = parser.parse_args()
46
46
  return {
47
- "credentials": parsed.credentials,
47
+ "token": parsed.token,
48
48
  "source_id": parsed.source_id,
49
49
  "file_path": parsed.file_path,
50
50
  "directory_path": parsed.directory_path,
@@ -1,6 +1,13 @@
1
1
  from enum import Enum
2
2
 
3
- EXTRACTION_BUCKET = "extraction-storage"
3
+ from ..utils import RetryStrategy
4
+
5
+ # url of the gcs proxy
6
+ INGEST_URL = "https://ingest.castordoc.com"
7
+
8
+ RETRY_BASE_MS = 10_000
9
+ RETRY_JITTER_MS = 1_000
10
+ RETRY_STRATEGY = RetryStrategy.LINEAR
4
11
 
5
12
 
6
13
  class FileType(Enum):
@@ -1,83 +1,100 @@
1
1
  #!/usr/bin/env python3
2
- import json
3
2
  import logging
4
3
  import ntpath
5
4
  from datetime import datetime
6
- from typing import Iterable, Optional, Union
5
+ from typing import Dict, Iterable, Optional, Tuple
7
6
  from uuid import UUID
8
7
 
9
- from google.cloud import storage # type: ignore
10
-
11
- from .constant import EXTRACTION_BUCKET, PATH_TEMPLATES, FileType
8
+ import requests
9
+
10
+ from ..utils.retry import retry
11
+ from .constant import (
12
+ INGEST_URL,
13
+ PATH_TEMPLATES,
14
+ RETRY_BASE_MS,
15
+ RETRY_JITTER_MS,
16
+ RETRY_STRATEGY,
17
+ FileType,
18
+ )
12
19
  from .env import get_blob_env
13
- from .utils import file_exist, iter_files
20
+ from .utils import iter_files
14
21
 
15
22
  logger = logging.getLogger(__name__)
16
23
 
24
+ _EXCEPTIONS = (
25
+ requests.exceptions.Timeout,
26
+ requests.exceptions.ConnectTimeout,
27
+ )
17
28
 
18
- def _client(credentials: Union[str, dict]) -> storage.Client:
19
- """supports dict, string or path to the JSON file"""
20
- if isinstance(credentials, dict):
21
- return storage.Client.from_service_account_info(credentials)
22
- if file_exist(credentials):
23
- return storage.Client.from_service_account_json(credentials)
24
- if isinstance(credentials, str):
25
- credentials = json.loads(credentials)
26
- return storage.Client.from_service_account_info(credentials)
27
- raise ValueError("needs path or dict for credentials")
28
29
 
30
+ def _path_and_url(
31
+ source_id: UUID,
32
+ file_type: FileType,
33
+ file_path: str,
34
+ ) -> Tuple[str, str]:
29
35
 
30
- def _path(source_id: UUID, file_type: FileType, file_path: str) -> str:
31
36
  now = datetime.utcnow()
32
37
  timestamp = int(now.timestamp())
33
38
  filename = ntpath.basename(file_path)
34
-
35
39
  path_template = PATH_TEMPLATES[file_type]
36
- return path_template.format(
40
+ path = path_template.format(
37
41
  timestamp=timestamp,
38
42
  source_id=source_id,
39
43
  filename=filename,
40
44
  )
41
45
 
46
+ url = f"{INGEST_URL}/{path}"
42
47
 
43
- def _get_blob(
44
- credentials: Union[str, dict],
45
- source_id: UUID,
46
- file_path: str,
47
- file_type: FileType,
48
- ) -> storage.Blob:
49
- """get the target blob to upload to"""
50
- client = _client(credentials)
51
- path = _path(source_id, file_type, file_path)
48
+ return path, url
52
49
 
53
- bucket = client.bucket(EXTRACTION_BUCKET)
54
- return bucket.blob(path)
50
+
51
+ def _headers(token: str) -> Dict:
52
+ return {
53
+ "Authorization": f"Token {token}",
54
+ "Accept": "text/csv, application/json",
55
+ }
55
56
 
56
57
 
57
58
  def _upload(
58
- credentials: Union[str, dict],
59
+ token: str,
59
60
  source_id: UUID,
60
61
  file_path: str,
61
62
  file_type: FileType,
62
63
  ) -> None:
63
64
  """
64
- credentials: path to file or dict
65
- source_id: id for the source
66
- file_type: type of file to upload
67
- file_path: path to the local file to upload
65
+ Upload the given file to Google Cloud Storage (GCS)
66
+ - Don't call GCS API directly
67
+ - Call the ingestion proxy which handles authorisation and uploading
68
68
  """
69
- timeout, retries = get_blob_env()
69
+ path, url = _path_and_url(source_id, file_type, file_path)
70
+ headers = _headers(token)
71
+ timeout, max_retries = get_blob_env()
70
72
 
71
- blob = _get_blob(credentials, source_id, file_path, file_type)
72
- with open(file_path, "rb") as f:
73
- blob.upload_from_file(f, timeout=timeout, num_retries=retries)
74
- logger.info(
75
- f"uploaded {file_path} as {file_type.value} to {blob.public_url}",
76
- )
73
+ with open(file_path, "rb") as file_content:
74
+
75
+ @retry(
76
+ exceptions=_EXCEPTIONS,
77
+ max_retries=max_retries,
78
+ base_ms=RETRY_BASE_MS,
79
+ jitter_ms=RETRY_JITTER_MS,
80
+ strategy=RETRY_STRATEGY,
81
+ )
82
+ def _request_post():
83
+ response = requests.post(
84
+ url=url,
85
+ headers=headers,
86
+ data=file_content,
87
+ timeout=timeout,
88
+ )
89
+ response.raise_for_status()
90
+
91
+ _request_post()
92
+
93
+ logger.info(f"Uploaded {file_path} as {file_type.value} to {path}")
77
94
 
78
95
 
79
96
  def upload_manifest(
80
- credentials: Union[str, dict],
97
+ token: str,
81
98
  source_id: UUID,
82
99
  file_path: str,
83
100
  ) -> None:
@@ -86,11 +103,11 @@ def upload_manifest(
86
103
  source_id: id for the source
87
104
  file_path: path to the local manifest to upload
88
105
  """
89
- _upload(credentials, source_id, file_path, FileType.DBT)
106
+ _upload(token, source_id, file_path, FileType.DBT)
90
107
 
91
108
 
92
109
  def upload(
93
- credentials: Union[str, dict],
110
+ token: str,
94
111
  source_id: UUID,
95
112
  file_type: FileType,
96
113
  file_path: Optional[str] = None,
@@ -113,4 +130,4 @@ def upload(
113
130
  raise ValueError(message)
114
131
 
115
132
  for file_ in files:
116
- _upload(credentials, source_id, file_, file_type)
133
+ _upload(token, source_id, file_, file_type)
@@ -1,7 +1,7 @@
1
1
  from uuid import UUID
2
2
 
3
- from .constant import FileType
4
- from .upload import _path
3
+ from .constant import INGEST_URL, FileType
4
+ from .upload import _path_and_url
5
5
 
6
6
 
7
7
  def test__path():
@@ -9,5 +9,6 @@ def test__path():
9
9
  file_type = FileType.VIZ
10
10
  file_path = "filename"
11
11
 
12
- path = _path(source_id, file_type, file_path)
12
+ path, url = _path_and_url(source_id, file_type, file_path)
13
13
  assert path == f"visualization-{source_id}/{file_path}"
14
+ assert url == f"{INGEST_URL}/{path}"
@@ -1,13 +1,23 @@
1
+ import logging
1
2
  import os
2
3
  from typing import Iterator
3
4
 
5
+ logger = logging.getLogger(__name__)
6
+
7
+ _ALLOWED_EXTENSION = (".json", ".csv")
8
+
4
9
 
5
10
  def iter_files(repository_path: str) -> Iterator[str]:
6
11
  """
7
12
  Given a repository path yield all files in that repository
13
+ Removes file whose extension is not allowed
8
14
  """
9
15
 
10
16
  for file in os.listdir(repository_path):
17
+ _, ext = os.path.splitext(file)
18
+ if ext not in _ALLOWED_EXTENSION:
19
+ logger.info(f"Forbidden file extension : skipping {file}")
20
+ continue
11
21
  file_path = os.path.join(repository_path, file)
12
22
 
13
23
  if os.path.isfile(file_path):
@@ -32,6 +32,7 @@ class WarehouseAsset(ExternalAsset):
32
32
  WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
33
33
  WarehouseAsset.EXTERNAL_COLUMN_LINEAGE,
34
34
  WarehouseAsset.EXTERNAL_TABLE_LINEAGE,
35
+ WarehouseAsset.FUNCTION,
35
36
  }
36
37
 
37
38
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: castor-extractor
3
- Version: 0.16.15
3
+ Version: 0.17.0
4
4
  Summary: Extract your metadata assets.
5
5
  Home-page: https://www.castordoc.com/
6
6
  License: EULA
@@ -1,4 +1,4 @@
1
- CHANGELOG.md,sha256=QYFobUPMbdi6cidq_yU-oMbXWoAr1BjTE6thfdZ9tA4,10866
1
+ CHANGELOG.md,sha256=EVZ9vhIVN7HLn5PYkRyBWyT3hk72Nt3i1SghwSipfR4,10957
2
2
  Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
3
3
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
4
4
  README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578
@@ -23,7 +23,7 @@ castor_extractor/commands/extract_snowflake.py,sha256=vYiruxRoo--GeMemOGsSE1w9kc
23
23
  castor_extractor/commands/extract_sqlserver.py,sha256=lwhbcNChaXHZgMgSOch3faVr7WJw-sDU6GHl3lzBt_0,1141
24
24
  castor_extractor/commands/extract_tableau.py,sha256=u-6UCd-kfXwyhNWYxZusqtgTTYkf4gAJS1vRIYWsAVU,1415
25
25
  castor_extractor/commands/file_check.py,sha256=PkXFK-kPoG8PpvBP-PCvVdreqwYw6Y1fTh2QzDxACsM,2684
26
- castor_extractor/commands/upload.py,sha256=tWN9hUn1aKJrGEmNHI_qjSciFiAoL9x7YolbIsYbg8Y,1956
26
+ castor_extractor/commands/upload.py,sha256=tAyHfIDOdUpD0yMJe2f64nXpaVnIbgYXi4bYx2nNvNU,1922
27
27
  castor_extractor/file_checker/__init__.py,sha256=OSt6YLhUT42U_Cp3LCLHMVruwDkksL75Ij13X2UPnVk,119
28
28
  castor_extractor/file_checker/column.py,sha256=fMchy5v-Sd-0xuYS0V9mob7wnljslzWLhQGqrKGybdk,3097
29
29
  castor_extractor/file_checker/column_test.py,sha256=1j8PxvmvmJgpd-mk30iMYOme32ovPSIn4yCXywFoXrg,1935
@@ -38,12 +38,12 @@ castor_extractor/file_checker/templates/generic_warehouse.py,sha256=zvnWnYB8FNvh
38
38
  castor_extractor/logger.py,sha256=ovf1mBEKwbJBskBXoqHbcAomBrp58mUwSrCWtEMlYPM,1197
39
39
  castor_extractor/types.py,sha256=-QgiOaq--nXUsYLy_oESDrYbRMxs353-YiQnG1blJvU,1303
40
40
  castor_extractor/uploader/__init__.py,sha256=SSRtwjg-dNoxME-RJy9G1flASiUKAC5bH1htq3CURQg,75
41
- castor_extractor/uploader/constant.py,sha256=hEJlWYx0dyBzgo59XUBKCYIKEODpIc2DyzwAZIiNO8g,718
41
+ castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
42
42
  castor_extractor/uploader/env.py,sha256=5HSniVSOYVg4u38O4k8TB_qaJq9s8yJ1hjedkq_gdVg,878
43
43
  castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
44
- castor_extractor/uploader/upload.py,sha256=5Aj3UOx8cpSVvzjYRz7S6nLk249IqUiCia70utU_970,3363
45
- castor_extractor/uploader/upload_test.py,sha256=BfGjAYEEDBmEcUS6_b3SlKyiQNR1iRf6-qmADDirTJI,328
46
- castor_extractor/uploader/utils.py,sha256=NCe0tkB28BVhqzOaDhDjaSfODjjcPWB17X6chnvyCWs,478
44
+ castor_extractor/uploader/upload.py,sha256=bTWD1_-hmJ6q1qcEosjZ96wsBtWDnWoCt692NYX_Nko,3228
45
+ castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
46
+ castor_extractor/uploader/utils.py,sha256=Tx_i875L2vJ8btOLV3-L0UMEFiyhH8E5n0XXRyLjO0Y,793
47
47
  castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
48
48
  castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
49
49
  castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
@@ -253,7 +253,7 @@ castor_extractor/visualization/tableau_revamp/constants.py,sha256=PcdudAogQhi3e-
253
253
  castor_extractor/visualization/tableau_revamp/extract.py,sha256=2SLUxp5okM4AcEJJ61ZgcC2ikfZZl9MH17CEXMXmgl0,1450
254
254
  castor_extractor/warehouse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
255
255
  castor_extractor/warehouse/abstract/__init__.py,sha256=Fdfa026tgOo64MvzVRLHM_F2G-JmcehrF0mh3dHgb7s,419
256
- castor_extractor/warehouse/abstract/asset.py,sha256=9nHL4WKUU_vRgj7u3sUdIzgI4rRpdS7YrfwNku4Gz9Q,2652
256
+ castor_extractor/warehouse/abstract/asset.py,sha256=Qs7T2Iw7KHgWVT2aAoBfCQ8tB143cUZY-DRUSkpgvGU,2689
257
257
  castor_extractor/warehouse/abstract/asset_test.py,sha256=_kd4ybNlWSAdSdEgJKC-jhJTa1nMRa9i8RO3YbqKLM4,758
258
258
  castor_extractor/warehouse/abstract/extract.py,sha256=fVBhdE-yMI_g6RBYZcr7q-ZVW7jK7WVkO_GO_KfkRqg,2908
259
259
  castor_extractor/warehouse/abstract/query.py,sha256=GAgeISCmAdrkTKzFGO79hQDf6SA6EFrrlW43w-LiXKo,2632
@@ -370,8 +370,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
370
370
  castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
371
371
  castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
372
372
  castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
373
- castor_extractor-0.16.15.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
374
- castor_extractor-0.16.15.dist-info/METADATA,sha256=CsdtS6LQFjsgi0A7tj0sMwtkQVYBye4Savn2DFGBHso,6583
375
- castor_extractor-0.16.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
376
- castor_extractor-0.16.15.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
377
- castor_extractor-0.16.15.dist-info/RECORD,,
373
+ castor_extractor-0.17.0.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
374
+ castor_extractor-0.17.0.dist-info/METADATA,sha256=mPiUyxCqXFifcPbhcOPFsnkPAV4OcWXoYzGeUKlbkoo,6582
375
+ castor_extractor-0.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
376
+ castor_extractor-0.17.0.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
377
+ castor_extractor-0.17.0.dist-info/RECORD,,