castor-extractor 0.18.5__py3-none-any.whl → 0.18.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the published artifacts exactly as distributed.

Potentially problematic release.

CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
  # Changelog
 
+ ## 0.18.7 - 2024-08-01
+
+ * Salesforce: extract table descriptions
+
+ ## 0.18.6 - 2024-07-30
+
+ * BigQuery: introduce extended regions to extract missing queries
+
  ## 0.18.5 - 2024-07-17
 
  * Salesforce: extract DeveloperName and tooling url

castor_extractor/warehouse/bigquery/client.py CHANGED
@@ -1,13 +1,14 @@
+ import itertools
  import logging
- from typing import List, Optional, Set, Tuple
+ from typing import List, Optional, Set
 
- from google.api_core.exceptions import Forbidden
- from google.api_core.page_iterator import Iterator as PageIterator
+ from google.api_core.exceptions import Forbidden # type: ignore
  from google.cloud.bigquery import Client as GoogleCloudClient # type: ignore
  from google.cloud.bigquery.dataset import Dataset # type: ignore
  from google.oauth2.service_account import Credentials # type: ignore
 
  from ...utils import SqlalchemyClient, retry
+ from .types import SetTwoString
 
  logger = logging.getLogger(__name__)
 
@@ -117,16 +118,50 @@ class BigQueryClient(SqlalchemyClient):
          ]
          return self._projects
 
-     def get_regions(self) -> Set[Tuple[str, str]]:
+     def get_regions(self) -> SetTwoString:
          """
-         Returns distinct (project_id, region) available for the given GCP client
+         Returns (project_id, region) available for the given GCP client
+         - Loops trough projects -> datasets -> region
+         - Returns distinct values
+         Example:
+         project_A
+         -> dataset_1:US
+         project_B
+         -> empty
+         project_C
+         -> dataset_2:EU
+         -> dataset_3:EU
+         Will return:
+         { (p_A, US), (p_C, EU) }
          """
          return {
              (ds.project, ds._properties["location"])
              for ds in self._list_datasets()
          }
 
-     def get_datasets(self) -> Set[Tuple[str, str]]:
+     def get_extended_regions(self) -> SetTwoString:
+         """
+         Returns all combinations of (project_id, region) for the given client
+         - Fetch all projects
+         - Fetch all regions (cross projects)
+         - Returns a combination of the two lists
+         Example:
+         project_A
+         -> dataset_1:US
+         project_B
+         -> empty
+         project_C
+         -> dataset_2:EU
+         -> dataset_3:EU
+         Will return:
+         { (p_A, EU), (p_A, US), (p_B, EU), (p_B, US), (p_C, EU), (p_C, US) }
+         """
+         projects = self.get_projects()
+         regions = {ds._properties["location"] for ds in self._list_datasets()}
+         combinations = itertools.product(projects, regions)
+         return set(combinations)
+
+     def get_datasets(self) -> SetTwoString:
          """
          Returns distinct (project_id, dataset_id) available for the given GCP client
          """

castor_extractor/warehouse/bigquery/extract.py CHANGED
@@ -68,6 +68,7 @@ def extract_all(**kwargs) -> None:
      query_builder = BigQueryQueryBuilder(
          regions=client.get_regions(),
          datasets=client.get_datasets(),
+         extended_regions=client.get_extended_regions(),
      )
 
      storage = LocalStorage(directory=output_directory)

castor_extractor/warehouse/bigquery/query.py CHANGED
@@ -2,18 +2,16 @@ import logging
  from typing import List, Optional
 
  from ..abstract import (
-     QUERIES_DIR,
      AbstractQueryBuilder,
      ExtractionQuery,
      TimeFilter,
      WarehouseAsset,
  )
- 
- # Those queries must be formatted with {region}
- from .types import IterTwoString
+ from .types import SetTwoString
 
  logger = logging.getLogger(__name__)
 
+ # Those queries must be formatted with {region}
  REGION_REQUIRED = (
      WarehouseAsset.COLUMN,
      WarehouseAsset.DATABASE,
@@ -23,6 +21,11 @@ REGION_REQUIRED = (
      WarehouseAsset.USER,
  )
 
+ # Some clients use empty projects (no datasets) to run their SQL queries
+ # The extended regions is a combination of all regions with all projects
+ # It allows to extract those queries which were left apart before
+ EXTENDED_REGION_REQUIRED = (WarehouseAsset.QUERY,)
+
  # Those queries must be formatted with {dataset}
  DATASET_REQUIRED = (WarehouseAsset.VIEW_DDL,)
 
@@ -38,7 +41,7 @@ SHARDED_ASSETS = (WarehouseAsset.TABLE, WarehouseAsset.COLUMN)
  SHARDED_FILE_PATH = "cte/sharded.sql"
 
 
- def _database_formatted(datasets: IterTwoString) -> str:
+ def _database_formatted(datasets: SetTwoString) -> str:
      databases = {db for _, db in datasets}
      if not databases:
          # when no datasets are provided condition should pass
@@ -55,10 +58,11 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
 
      def __init__(
          self,
-         regions: IterTwoString,
-         datasets: IterTwoString,
+         regions: SetTwoString,
+         datasets: SetTwoString,
          time_filter: Optional[TimeFilter] = None,
          sync_tags: Optional[bool] = False,
+         extended_regions: Optional[SetTwoString] = None,
      ):
          super().__init__(
              time_filter=time_filter,
@@ -67,6 +71,7 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
          self._regions = regions
          self._datasets = datasets
          self._sync_tags = sync_tags
+         self._extended_regions = extended_regions or regions
 
      @staticmethod
      def _format(query: ExtractionQuery, values: dict) -> ExtractionQuery:
@@ -97,6 +102,13 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
          sharded_statement = self._load_from_file(SHARDED_FILE_PATH)
          return statement.format(sharded_statement=sharded_statement)
 
+     def _get_regions(self, asset: WarehouseAsset) -> SetTwoString:
+         return (
+             self._extended_regions
+             if asset in EXTENDED_REGION_REQUIRED
+             else self._regions
+         )
+
      def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
          """
          It would be easier to stitch data directly in the query statement (UNION ALL).
@@ -110,12 +122,14 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
          query = super().build_default(asset)
 
          if asset in REGION_REQUIRED:
+             regions = self._get_regions(asset)
+
              logger.info(
-                 f"\tWill run queries with following region params: {self._regions}",
+                 f"\tWill run queries with following region params: {regions}",
              )
              return [
                  self._format(query, {"project": project, "region": region})
-                 for project, region in self._regions
+                 for project, region in regions
              ]
 
          if asset in DATASET_REQUIRED:
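
Taken together, the builder's change amounts to a simple selection rule. The sketch below is illustrative only (plain strings instead of WarehouseAsset, hypothetical region values) and also shows the `extended_regions or regions` fallback that keeps older callers behaving as before:

from typing import Optional, Set, Tuple

SetTwoString = Set[Tuple[str, str]]

def pick_regions(
    asset: str,
    regions: SetTwoString,
    extended_regions: Optional[SetTwoString] = None,
) -> SetTwoString:
    # Mirrors _get_regions plus the `extended_regions or regions` fallback
    extended = extended_regions or regions
    return extended if asset == "QUERY" else regions

observed = {("project_A", "US"), ("project_C", "EU")}
extended = {(p, r) for p in ("project_A", "project_B", "project_C") for r in ("US", "EU")}

assert pick_regions("QUERY", observed, extended) == extended
assert pick_regions("COLUMN", observed, extended) == observed
assert pick_regions("QUERY", observed) == observed  # no extended set supplied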

castor_extractor/warehouse/bigquery/types.py CHANGED
@@ -1,5 +1,4 @@
- from typing import Iterable, Set, Tuple
+ from typing import Set, Tuple
 
  SetString = Set[str]
  SetTwoString = Set[Tuple[str, str]]
- IterTwoString = Iterable[Tuple[str, str]]

castor_extractor/warehouse/salesforce/client.py CHANGED
@@ -1,11 +1,15 @@
  import logging
- from typing import Dict, Iterator, List, Tuple
+ from typing import Dict, Iterator, List, Optional, Tuple
 
  from tqdm import tqdm # type: ignore
 
  from ...utils.salesforce import SalesforceBaseClient, SalesforceCredentials
  from .format import SalesforceFormatter
- from .soql import SOBJECT_FIELDS_QUERY_TPL, SOBJECTS_QUERY_TPL
+ from .soql import (
+     DESCRIPTION_QUERY_TPL,
+     SOBJECT_FIELDS_QUERY_TPL,
+     SOBJECTS_QUERY_TPL,
+ )
 
  logger = logging.getLogger(__name__)
 
@@ -90,13 +94,34 @@ class SalesforceClient(SalesforceBaseClient):
          response = self._call(self.tooling_url, params={"q": query})
          return response["records"]
 
+     def fetch_description(self, table_name: str) -> Optional[str]:
+         """Retrieve description of a table"""
+         query = DESCRIPTION_QUERY_TPL.format(table_name=table_name)
+         response = self._call(self.tooling_url, params={"q": query})
+         if not response["records"]:
+             return None
+         return response["records"][0]["Description"]
+
+     def add_table_descriptions(self, sobjects: List[dict]) -> List[dict]:
+         """
+         Add table descriptions.
+         We use the tooling API which does not handle well the LIMIT in SOQL
+         so we have to retrieve descriptions individually
+         """
+         described_sobjects = []
+         for sobject in sobjects:
+             description = self.fetch_description(sobject["QualifiedApiName"])
+             described_sobjects.append({**sobject, "Description": description})
+         return described_sobjects
+
      def tables(self) -> List[dict]:
          """
          Get Salesforce sobjects as tables
          """
          sobjects = self.fetch_sobjects()
          logger.info(f"Extracted {len(sobjects)} sobjects")
-         return list(self.formatter.tables(sobjects))
+         described_sobjects = self.add_table_descriptions(sobjects)
+         return list(self.formatter.tables(described_sobjects))
 
      def columns(
          self, sobject_names: List[Tuple[str, str]], show_progress: bool = True
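
Because the tooling API is called once per sobject, the enrichment step is a per-row merge. A minimal sketch with a stubbed lookup, using hypothetical sobject names and no real Salesforce calls:

from typing import Callable, List, Optional

def add_table_descriptions(
    sobjects: List[dict],
    fetch_description: Callable[[str], Optional[str]],
) -> List[dict]:
    # One description lookup per sobject, merged back into the payload
    return [
        {**sobject, "Description": fetch_description(sobject["QualifiedApiName"])}
        for sobject in sobjects
    ]

stub = {"Account": "Company accounts"}.get  # stands in for the tooling API call
rows = add_table_descriptions(
    [{"QualifiedApiName": "Account"}, {"QualifiedApiName": "Custom__c"}],
    stub,
)
assert rows[0]["Description"] == "Company accounts"
assert rows[1]["Description"] is None  # no description found for this sobject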

castor_extractor/warehouse/salesforce/format.py CHANGED
@@ -44,7 +44,7 @@ def _to_table_payload(sobject: dict, table_name: str) -> dict:
          "label": sobject["Label"],
          "schema_id": SCHEMA_NAME,
          "table_name": table_name,
-         "description": "",
+         "description": sobject.get("Description"),
          "tags": [],
          "type": "TABLE",
      }

castor_extractor/warehouse/salesforce/format_test.py CHANGED
@@ -59,7 +59,7 @@ def test__merge_label_and_api_name():
          "label": "foo",
          "schema_id": SCHEMA_NAME,
          "table_name": expected_name,
-         "description": "",
+         "description": None,
          "tags": [],
          "type": "TABLE",
      }

castor_extractor/warehouse/salesforce/soql.py CHANGED
@@ -1,3 +1,9 @@
+ DESCRIPTION_QUERY_TPL = """
+ SELECT Description
+ FROM EntityDefinition
+ WHERE QualifiedApiName = '{table_name}'
+ """
+
  SOBJECTS_QUERY_TPL = """
  SELECT
      DeveloperName,
@@ -13,7 +19,6 @@ SOBJECTS_QUERY_TPL = """
  LIMIT {limit}
  """
 
- 
  SOBJECT_FIELDS_QUERY_TPL = """
  SELECT
      DataType,
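
For reference, this is how fetch_description fills the new template; the table name Account is a hypothetical example:

DESCRIPTION_QUERY_TPL = """
SELECT Description
FROM EntityDefinition
WHERE QualifiedApiName = '{table_name}'
"""

# fetch_description formats the template per table before calling the tooling endpoint
soql = DESCRIPTION_QUERY_TPL.format(table_name="Account")
assert "WHERE QualifiedApiName = 'Account'" in soql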

castor_extractor-0.18.5.dist-info/METADATA → castor_extractor-0.18.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: castor-extractor
- Version: 0.18.5
+ Version: 0.18.7
  Summary: Extract your metadata assets.
  Home-page: https://www.castordoc.com/
  License: EULA

castor_extractor-0.18.5.dist-info/RECORD → castor_extractor-0.18.7.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
- CHANGELOG.md,sha256=JT2Xz-gVL-ASKBhU8Hxo07ks_as2hr8MpMjD9APvET8,11835
+ CHANGELOG.md,sha256=uL2xlPDomxLQhjD55aaCwT6ItdW_ziWGTIjjOAF0IXo,11992
  Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -263,10 +263,10 @@ castor_extractor/warehouse/abstract/query.py,sha256=GAgeISCmAdrkTKzFGO79hQDf6SA6
  castor_extractor/warehouse/abstract/time_filter.py,sha256=bggIONfMmUxffkA6TwM3BsjfS2l9WFxPq8krfsau5pw,935
  castor_extractor/warehouse/abstract/time_filter_test.py,sha256=PIkegB7KOKBdpc6zIvmyl_CeQyADeFDplyQ8HTNU5LA,448
  castor_extractor/warehouse/bigquery/__init__.py,sha256=PCGNYdi7dHv-SyanUWzRuBp-ypuQ01PkDaQjVnaNhbM,170
- castor_extractor/warehouse/bigquery/client.py,sha256=ypLKXvvfR0RtKex4T2mNvoef4T-jRF1T_RZGCZ6qbOM,4495
+ castor_extractor/warehouse/bigquery/client.py,sha256=UefTefQp0S9kpRQzIzJhlm6VcH5uoCRAHHCgfRB6I58,5606
  castor_extractor/warehouse/bigquery/client_test.py,sha256=Ym8e4d--0YQwiVcNUnXLx0X-X6ZznwNMBMbMaDS5oEA,1514
  castor_extractor/warehouse/bigquery/credentials.py,sha256=oCZ8H7qpudKzwM7PRMpVAmWXt7bjIRa8Harmp-ysQJ4,425
- castor_extractor/warehouse/bigquery/extract.py,sha256=vZFxJC1LtUMph5UhfhYdJLnsEto18IOERKzrt71jqJg,2883
+ castor_extractor/warehouse/bigquery/extract.py,sha256=lwiam_9YsaUumoY4OwgRMMV4OwLhPwAwnQNCkt-JIeo,2939
  castor_extractor/warehouse/bigquery/queries/.sqlfluff,sha256=ce8UDW2k39v6RBVxgKqjOHHYMoGN9S9f7BCZNHHhox8,30
  castor_extractor/warehouse/bigquery/queries/column.sql,sha256=NxdTnHwomHTEGSc-UoXFKUwg59I9XAOwrSau7JUqGQE,1815
  castor_extractor/warehouse/bigquery/queries/cte/sharded.sql,sha256=-G7_4lxV7UPe72mYlp4HDGeM_fJjZWuXJ7Q0vxvj5_U,1454
@@ -277,8 +277,8 @@ castor_extractor/warehouse/bigquery/queries/table.sql,sha256=D15UNem03Bfcy0JYvKT
  castor_extractor/warehouse/bigquery/queries/table_with_tags.sql,sha256=mhWQHaLgyumtdJX5XyEbdrn_Qtt-RCu4cH1WLM6TN9o,2660
  castor_extractor/warehouse/bigquery/queries/user.sql,sha256=l-fkNGWJVdZwVhbFZL23B8tve-UKXAI6HRlnQq0gIwM,192
  castor_extractor/warehouse/bigquery/queries/view_ddl.sql,sha256=obCm-IN9V8_YSZTwcgNSBDD0ZXPgRjlxJjrZDSEH2MU,326
- castor_extractor/warehouse/bigquery/query.py,sha256=hrFfjd5jW2oQnZ6ozlkn-gDe6sCIzu5zSX19T9W6fIk,4162
- castor_extractor/warehouse/bigquery/types.py,sha256=LZVWSmE57lOemNbB5hBRyYmDk9bFAU4nbRaJWALl6N8,140
+ castor_extractor/warehouse/bigquery/query.py,sha256=5Qc8PEa-kQKpTzguj4RNCAwKyvzWt20vAESYNB0lueo,4768
+ castor_extractor/warehouse/bigquery/types.py,sha256=DHK3wUaaLyLMp7LP-7QkXTDYpYTZiPtvptAOkpxgp4g,88
  castor_extractor/warehouse/databricks/__init__.py,sha256=YG3YSIJgCFRjjI8eExy9T7qGnfnjWhMFh8c15KTs_BA,184
  castor_extractor/warehouse/databricks/client.py,sha256=ohqsCyLdUJCJGTBK-IBHCV2BUujPG2VsOkc8qAapaPk,20636
  castor_extractor/warehouse/databricks/client_test.py,sha256=KNp4Hi_CC6GwiW2QDJQQwqALfUebuT9D_qL6FuP_8tY,5246
@@ -329,12 +329,12 @@ castor_extractor/warehouse/redshift/queries/user.sql,sha256=sEXveJAuNvZacvpI6Wfw
  castor_extractor/warehouse/redshift/queries/view_ddl.sql,sha256=Pkyh_QT6d4rhTeyiVcqw6O8CRl7NEhk2p7eM5YIn5kg,719
  castor_extractor/warehouse/redshift/query.py,sha256=0C81rkt2cpkWrJIxxwALDyqr-49vlqQM04y_N6wwStc,540
  castor_extractor/warehouse/salesforce/__init__.py,sha256=NR4aNea5jeE1xYqeZ_29deeN84CkN0_D_Z7CLQdJvFY,137
- castor_extractor/warehouse/salesforce/client.py,sha256=ETnZ3n-GFFH0XohDB2ft74wI1HMspvTefR3k7ne-pmI,3891
+ castor_extractor/warehouse/salesforce/client.py,sha256=F3UdD5-9umEU-u_c7uVtaksg81VZeXRW83BVsFvYDkE,4902
  castor_extractor/warehouse/salesforce/constants.py,sha256=GusduVBCPvwpk_Im6F3bDvXeNQ7hRnCMdIAjIg65RnE,52
  castor_extractor/warehouse/salesforce/extract.py,sha256=GaxkGWhdksDT_rlT24KX8DMpWnhKlhDMAUvBPGalli0,3454
- castor_extractor/warehouse/salesforce/format.py,sha256=GsWrMEPCMSjP8XuMpDmC5SEMrSp0_xmbCCYQ8PfQebg,3265
- castor_extractor/warehouse/salesforce/format_test.py,sha256=HBlAYBoCOHaq_QOFudZlpcZb5TyZWV9v-cxK4tklg50,2253
- castor_extractor/warehouse/salesforce/soql.py,sha256=pAEaJE8ZUcyN3ptBsZGzNcGRhCcU81X6RMlnF1HRMw4,1063
+ castor_extractor/warehouse/salesforce/format.py,sha256=eiPM_4i_m3FEg_2jkMYlhaRBg3gTvV-9xQuk8ghJZiM,3289
+ castor_extractor/warehouse/salesforce/format_test.py,sha256=aWUUYDAX-hN1XQJHlv6ZtI2noXWjRobV-9zdjiXR5n4,2255
+ castor_extractor/warehouse/salesforce/soql.py,sha256=XB8ohKwHFfC4Xger7Y84DXLW17IJDye_bZ3FL6DCcOI,1188
  castor_extractor/warehouse/snowflake/__init__.py,sha256=TEGXTyxWp4Tr9gIHb-UFVTRKj6YWmrRtqHruiKSZGiY,174
  castor_extractor/warehouse/snowflake/client.py,sha256=XT0QLVNff_586SDuMe40iu8FCwPDh2uBV5aKc1Ql914,5555
  castor_extractor/warehouse/snowflake/client_test.py,sha256=ihWtOOAQfh8pu5JTr_EWfqefKOVIaJXznACURzaU1Qs,1432
@@ -375,8 +375,8 @@ castor_extractor/warehouse/synapse/queries/schema.sql,sha256=aX9xNrBD_ydwl-znGSF
  castor_extractor/warehouse/synapse/queries/table.sql,sha256=mCE8bR1Vb7j7SwZW2gafcXidQ2fo1HwxcybA8wP2Kfs,1049
  castor_extractor/warehouse/synapse/queries/user.sql,sha256=sTb_SS7Zj3AXW1SggKPLNMCd0qoTpL7XI_BJRMaEpBg,67
  castor_extractor/warehouse/synapse/queries/view_ddl.sql,sha256=3EVbp5_yTgdByHFIPLHmnoOnqqLE77SrjAwFDvu4e54,249
- castor_extractor-0.18.5.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
- castor_extractor-0.18.5.dist-info/METADATA,sha256=MCiHMiLGVUz69hRClDJHsMECllqppMC4yZkg-22ETNM,7209
- castor_extractor-0.18.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- castor_extractor-0.18.5.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
- castor_extractor-0.18.5.dist-info/RECORD,,
+ castor_extractor-0.18.7.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+ castor_extractor-0.18.7.dist-info/METADATA,sha256=QCkJjnGnFsOfbZ808-jK6dthnNFKqzC_YLydG1sUSqM,7209
+ castor_extractor-0.18.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ castor_extractor-0.18.7.dist-info/entry_points.txt,sha256=SbyPk58Gh-FRztfCNnUZQ6w7SatzNJFZ6GIJLNsy7tI,1427
+ castor_extractor-0.18.7.dist-info/RECORD,,