castor-extractor 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.17.2 - 2024-06-14
4
+
5
+ * Uploader: support multipart
6
+
7
+ ## 0.17.1 - 2024-06-12
8
+
9
+ * Databricks: extract table source links
10
+
3
11
  ## 0.17.0 - 2024-06-10
4
12
 
5
13
  * Uploader: redirect to the proxy, replace credentials with token
@@ -83,7 +83,7 @@ def _upload(
83
83
  response = requests.post(
84
84
  url=url,
85
85
  headers=headers,
86
- data=file_content,
86
+ files={"file": file_content},
87
87
  timeout=timeout,
88
88
  )
89
89
  response.raise_for_status()
@@ -58,7 +58,12 @@ class APIClient:
58
58
 
59
59
  return result.json()
60
60
 
61
- def get(self, path: str, payload: Optional[dict] = None) -> dict:
61
+ def get(
62
+ self,
63
+ path: str,
64
+ payload: Optional[dict] = None,
65
+ processor: Optional[Callable] = None,
66
+ ) -> dict:
62
67
  """path: REST API operation path, such as /api/2.0/clusters/get"""
63
68
  url = self.build_url(self._host, path)
64
- return self._call(url=url, data=payload)
69
+ return self._call(url=url, data=payload, processor=processor)
@@ -5,6 +5,7 @@ from functools import partial
5
5
  from typing import Any, Dict, List, Optional, Set, Tuple, cast
6
6
 
7
7
  import requests
8
+ from requests import Response
8
9
 
9
10
  from ...utils import (
10
11
  SafeMode,
@@ -30,6 +31,7 @@ _RETRY_BASE_MS = 1000
30
31
  _RETRY_EXCEPTIONS = [
31
32
  requests.exceptions.ConnectTimeout,
32
33
  ]
34
+ _WORKSPACE_ID_HEADER = "X-Databricks-Org-Id"
33
35
 
34
36
  safe_params = SafeMode((BaseException,), _MAX_NUMBER_OF_LINEAGE_ERRORS)
35
37
 
@@ -120,15 +122,28 @@ class DatabricksClient(APIClient):
120
122
  for schema in self._schemas_of_database(database)
121
123
  ]
122
124
 
125
+ @staticmethod
126
+ def _process_table_response(response: Response) -> Tuple[dict, str]:
127
+ """
128
+ Returns both the JSON content and the Workspace ID, which is found
129
+ in the response's headers.
130
+ """
131
+ return response.json(), response.headers[_WORKSPACE_ID_HEADER]
132
+
123
133
  def _tables_columns_of_schema(self, schema: dict) -> TablesColumns:
124
134
  path = "api/2.1/unity-catalog/tables"
125
135
  payload = {
126
136
  "catalog_name": schema["database_id"],
127
137
  "schema_name": schema["schema_name"],
128
138
  }
129
- content = self.get(path=path, payload=payload)
139
+ content, workspace_id = self.get(
140
+ path=path,
141
+ payload=payload,
142
+ processor=self._process_table_response,
143
+ )
144
+ host = self.build_url(self._host, path="")
130
145
  return self.formatter.format_table_column(
131
- content.get("tables", []), schema
146
+ content.get("tables", []), schema, host, workspace_id
132
147
  )
133
148
 
134
149
  @staticmethod
@@ -9,6 +9,8 @@ logger = logging.getLogger(__name__)
9
9
  EXCLUDED_DATABASES = {"system"}
10
10
  EXCLUDED_SCHEMAS = {"information_schema", "default"}
11
11
 
12
+ TABLE_URL_TPL = "{host}explore/data/{catalog_name}/{schema_name}/{table_name}?o={workspace_id}"
13
+
12
14
 
13
15
  def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
14
16
  """return time in ms as datetime or None"""
@@ -17,7 +19,23 @@ def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
17
19
  return datetime.fromtimestamp(time_ms / 1000.0)
18
20
 
19
21
 
20
- def _table_payload(schema: dict, table: dict) -> dict:
22
+ def _table_payload(
23
+ schema: dict,
24
+ table: dict,
25
+ host: str,
26
+ workspace_id: str,
27
+ ) -> dict:
28
+ """
29
+ Prepares the table payload. This also includes a source link which is built
30
+ here using the host and workspace_id.
31
+ """
32
+ url = TABLE_URL_TPL.format(
33
+ host=host,
34
+ catalog_name=table["catalog_name"],
35
+ schema_name=table["schema_name"],
36
+ table_name=table["name"],
37
+ workspace_id=workspace_id,
38
+ )
21
39
  return {
22
40
  "description": table.get("comment"),
23
41
  "id": table["table_id"],
@@ -26,6 +44,7 @@ def _table_payload(schema: dict, table: dict) -> dict:
26
44
  "table_name": table["name"],
27
45
  "tags": [],
28
46
  "type": table.get("table_type"),
47
+ "url": url,
29
48
  }
30
49
 
31
50
 
@@ -78,14 +97,14 @@ class DatabricksFormatter:
78
97
 
79
98
  @staticmethod
80
99
  def format_table_column(
81
- raw_tables: List[dict], schema: dict
100
+ raw_tables: List[dict], schema: dict, host: str, workspace_id: str
82
101
  ) -> TablesColumns:
83
102
  tables = []
84
103
  columns = []
85
104
  if not raw_tables:
86
105
  return [], []
87
106
  for table in raw_tables:
88
- t = _table_payload(schema, table)
107
+ t = _table_payload(schema, table, host, workspace_id)
89
108
  tables.append(t)
90
109
  if not table.get("columns"):
91
110
  continue
@@ -1,6 +1,11 @@
1
1
  from datetime import datetime
2
2
 
3
- from .format import DatabricksFormatter, _column_payload, _to_datetime_or_none
3
+ from .format import (
4
+ DatabricksFormatter,
5
+ _column_payload,
6
+ _table_payload,
7
+ _to_datetime_or_none,
8
+ )
4
9
 
5
10
 
6
11
  def test__to_datetime_or_none():
@@ -23,6 +28,35 @@ def test_DatabricksFormatter__primary():
23
28
  assert DatabricksFormatter._primary([]) is None
24
29
 
25
30
 
31
+ def test__table_payload():
32
+ schema = {"id": "id123"}
33
+
34
+ table = {
35
+ "name": "baz",
36
+ "catalog_name": "foo",
37
+ "schema_name": "bar",
38
+ "table_type": "MANAGED",
39
+ "owner": "pot@ato.com",
40
+ "table_id": "732pot5e-8ato-4c27-b701-9fa51febc192",
41
+ }
42
+ host = "https://some.cloud.databricks.net/"
43
+ workspace_id = "123456"
44
+
45
+ payload = _table_payload(schema, table, host, workspace_id)
46
+
47
+ expected = {
48
+ "description": None,
49
+ "id": "732pot5e-8ato-4c27-b701-9fa51febc192",
50
+ "owner_email": "pot@ato.com",
51
+ "schema_id": "id123",
52
+ "table_name": "baz",
53
+ "tags": [],
54
+ "type": "MANAGED",
55
+ "url": "https://some.cloud.databricks.net/explore/data/foo/bar/baz?o=123456",
56
+ }
57
+ assert payload == expected
58
+
59
+
26
60
  def test__column_payload():
27
61
  table = {
28
62
  "id": "18175cd5-9b9b-4d78-9d28-caaa12c21ce0",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: castor-extractor
3
- Version: 0.17.0
3
+ Version: 0.17.2
4
4
  Summary: Extract your metadata assets.
5
5
  Home-page: https://www.castordoc.com/
6
6
  License: EULA
@@ -1,4 +1,4 @@
1
- CHANGELOG.md,sha256=EVZ9vhIVN7HLn5PYkRyBWyT3hk72Nt3i1SghwSipfR4,10957
1
+ CHANGELOG.md,sha256=qR-os6GsyPBBkE3MwBCB4nYlMULY-D0vqKLXDcamyMU,11078
2
2
  Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
3
3
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
4
4
  README.md,sha256=uF6PXm9ocPITlKVSh9afTakHmpLx3TvawLf-CbMP3wM,3578
@@ -41,13 +41,13 @@ castor_extractor/uploader/__init__.py,sha256=SSRtwjg-dNoxME-RJy9G1flASiUKAC5bH1h
41
41
  castor_extractor/uploader/constant.py,sha256=yTigLHDlYwoRr6CpFIl7ReElFsQd4H-qkluMZJPWSx0,865
42
42
  castor_extractor/uploader/env.py,sha256=5HSniVSOYVg4u38O4k8TB_qaJq9s8yJ1hjedkq_gdVg,878
43
43
  castor_extractor/uploader/env_test.py,sha256=ClCWWtwd2N-5ClIDUxVMeKkWfhhOTxpppsXUDmdjxSg,472
44
- castor_extractor/uploader/upload.py,sha256=bTWD1_-hmJ6q1qcEosjZ96wsBtWDnWoCt692NYX_Nko,3228
44
+ castor_extractor/uploader/upload.py,sha256=W1TGqO8_PtFdR661qNlh6v-LOIRvoJoda65-5OujFXs,3239
45
45
  castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
46
46
  castor_extractor/uploader/utils.py,sha256=Tx_i875L2vJ8btOLV3-L0UMEFiyhH8E5n0XXRyLjO0Y,793
47
47
  castor_extractor/utils/__init__.py,sha256=bmzAOc-PKsVreMJtF7DGpPQeHrVqxWel_BblRftt6Ag,1186
48
48
  castor_extractor/utils/client/__init__.py,sha256=CRE-xJKm6fVV9dB8ljzB5YoOxX4I1sCD1KSgqs3Y8_Y,161
49
49
  castor_extractor/utils/client/abstract.py,sha256=aA5Qcb9TwWDSMq8WpXbGkOB20hehwX2VTpqQAwV76wk,2048
50
- castor_extractor/utils/client/api.py,sha256=z1o4fteWx1HxNTqCYihl9sGkIgSQTbd8lW_B9Y2wyeQ,1742
50
+ castor_extractor/utils/client/api.py,sha256=AGDj2JH__Q_x7RQdodoVazGvjGQJ9TzNqs-XEX6Hrms,1840
51
51
  castor_extractor/utils/client/api_test.py,sha256=NSMdXg1FLc37erqHp2FZsIsogWVv6lFSs7rDXHikr-E,542
52
52
  castor_extractor/utils/client/postgres.py,sha256=n6ulaT222WWPY0_6qAZ0MHF0m91HtI9mMqL71nyygo0,866
53
53
  castor_extractor/utils/client/query.py,sha256=O6D5EjD1KmBlwa786Uw4D4kzxx97_HH50xIIeSWt0B8,205
@@ -277,12 +277,12 @@ castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTw
277
277
  castor_extractor/warehouse/bigquery/query.py,sha256=hrFfjd5jW2oQnZ6ozlkn-gDe6sCIzu5zSX19T9W6fIk,4162
278
278
  castor_extractor/warehouse/bigquery/types.py,sha256=LZVWSmE57lOemNbB5hBRyYmDk9bFAU4nbRaJWALl6N8,140
279
279
  castor_extractor/warehouse/databricks/__init__.py,sha256=bTvDxjGQGM2J3hOnVhfNmFP1y8DK0tySiD_EXe5_xWE,200
280
- castor_extractor/warehouse/databricks/client.py,sha256=oHR_htE25p5tiAAFZKbF48efo7tqIENW4dAGA7yEqHg,16895
280
+ castor_extractor/warehouse/databricks/client.py,sha256=sNY-7FDg9nLaqw2zk_aoGAhSGi8KST9QpHAHn46700w,17439
281
281
  castor_extractor/warehouse/databricks/client_test.py,sha256=KNp4Hi_CC6GwiW2QDJQQwqALfUebuT9D_qL6FuP_8tY,5246
282
282
  castor_extractor/warehouse/databricks/credentials.py,sha256=PpGv5_GP320UQjV_gvaxSpOw58AmqSznmjGhGfe6bdU,655
283
283
  castor_extractor/warehouse/databricks/extract.py,sha256=VX-3uo5dZucenrg-wnPur3CxOgpC5H7Ds92TO7OTAjc,7379
284
- castor_extractor/warehouse/databricks/format.py,sha256=2bRy2fa45NW3uk030rmyba4n2Em-NnyZPBurUslEbcw,5522
285
- castor_extractor/warehouse/databricks/format_test.py,sha256=iPmdJof43fBYL1Sa_fBrCWDQHCHgm7IWCZag1kWkj9E,1970
284
+ castor_extractor/warehouse/databricks/format.py,sha256=zSO3Cm-vpidzNA07W81I506u-ToQzkjXVwKDmS-tfiE,6088
285
+ castor_extractor/warehouse/databricks/format_test.py,sha256=HZvJjcB7sj7LF1kIxAeaf_KdD3XOKn9nfeQLRT39G3s,2804
286
286
  castor_extractor/warehouse/databricks/test_constants.py,sha256=Hm96yq_ltVAKv7WYhYz637r4Cuj-1cCdyOuxMEe3J-Q,2246
287
287
  castor_extractor/warehouse/databricks/types.py,sha256=hD6gC8oiT3QSWEvbtgUOGK_lLzzz36sEauB3lS_wxlE,218
288
288
  castor_extractor/warehouse/mysql/__init__.py,sha256=2KFDogo9GNbApHqw3Vm5t_uNmIRjdp76nmP_WQQMfQY,116
@@ -370,8 +370,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
370
370
  castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
371
371
  castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
372
372
  castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
373
- castor_extractor-0.17.0.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
374
- castor_extractor-0.17.0.dist-info/METADATA,sha256=mPiUyxCqXFifcPbhcOPFsnkPAV4OcWXoYzGeUKlbkoo,6582
375
- castor_extractor-0.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
376
- castor_extractor-0.17.0.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
377
- castor_extractor-0.17.0.dist-info/RECORD,,
373
+ castor_extractor-0.17.2.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
374
+ castor_extractor-0.17.2.dist-info/METADATA,sha256=V-1vK-HPqqZxUKCRncvmp__Yg4UA3B1Aza7Ac4uVFrA,6582
375
+ castor_extractor-0.17.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
376
+ castor_extractor-0.17.2.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
377
+ castor_extractor-0.17.2.dist-info/RECORD,,