castor-extractor 0.24.0__py3-none-any.whl → 0.24.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.24.1 - 2025-03-14
4
+
5
+ * Added support for Looker Studio
6
+
3
7
  ## 0.24.0 - 2025-03-10
4
8
 
5
9
  * Remove legacy Tableau Connector
README.md CHANGED
@@ -104,7 +104,9 @@ Depending on your use case, you can also install one of the following `extras`:
104
104
 
105
105
  ```bash
106
106
  pip install castor-extractor[bigquery]
107
+ pip install castor-extractor[databricks]
107
108
  pip install castor-extractor[looker]
109
+ pip install castor-extractor[lookerstudio]
108
110
  pip install castor-extractor[metabase]
109
111
  pip install castor-extractor[mysql]
110
112
  pip install castor-extractor[powerbi]
@@ -0,0 +1,30 @@
1
+ from argparse import ArgumentParser
2
+
3
+ from castor_extractor.utils import parse_filled_arguments # type: ignore
4
+ from castor_extractor.visualization import looker_studio # type: ignore
5
+
6
+
7
def main():
    """
    Command-line entry point for the Looker Studio extraction.

    Registers the supported options on an `ArgumentParser`, then passes the
    result of `parse_filled_arguments(parser)` as keyword arguments to
    `looker_studio.extract_all`.
    """
    # (short flag, long flag, help text), registered in this exact order
    options = (
        (
            "-c",
            "--credentials",
            "File path to Service Account credentials with Looker Studio access",
        ),
        (
            "-a",
            "--admin-email",
            "Email of a Google Workspace user with admin access",
        ),
        (
            "-b",
            "--bigquery-credentials",
            "Optional: file path to Service Account credentials with BigQuery access. "
            "This can be the same file path as for Looker Studio.",
        ),
        ("-o", "--output", "Directory to write to"),
    )

    cli = ArgumentParser()
    for short_flag, long_flag, description in options:
        cli.add_argument(short_flag, long_flag, help=description)

    extraction_kwargs = parse_filled_arguments(cli)
    looker_studio.extract_all(**extraction_kwargs)
@@ -4,3 +4,4 @@ from .client import (
4
4
  LookerStudioClient,
5
5
  LookerStudioCredentials,
6
6
  )
7
+ from .extract import extract_all
@@ -3,4 +3,5 @@ from ...types import ExternalAsset
3
3
 
4
4
  class LookerStudioAsset(ExternalAsset):
5
5
  ASSETS = "assets"
6
+ SOURCE_QUERIES = "source_queries"
6
7
  VIEW_ACTIVITY = "view_activity"
@@ -1,21 +1,49 @@
1
- from typing import Iterator
1
+ from typing import Iterator, Optional
2
2
 
3
+ from ....utils import empty_iterator
4
+ from ....warehouse.abstract import WarehouseAsset
5
+ from ....warehouse.bigquery import BigQueryClient, BigQueryQueryBuilder
3
6
  from .. import LookerStudioAsset
4
7
  from .admin_sdk_client import USER_EMAIL_FIELD, AdminSDKClient
5
8
  from .credentials import LookerStudioCredentials
6
9
  from .looker_studio_api_client import LookerStudioAPIClient
7
10
 
8
11
 
12
class LookerStudioQueryBuilder(BigQueryQueryBuilder):
    """
    BigQuery query builder dedicated to the Looker Studio job history.
    """

    def job_history_queries(self) -> list:
        """
        Builds the ExtractionQueries for `WarehouseAsset.QUERY` by delegating
        to the parent builder.

        This is a workaround to reuse the generic BigQuery query-history
        machinery while changing only the SQL "template": since the class is
        declared here, the template picked up is the `queries/query.sql` file
        located in the same directory as this file, which restricts the job
        history to Looker Studio.
        """
        history_queries = super().build(WarehouseAsset.QUERY)  # type: ignore
        return history_queries
24
+
25
+
9
26
  class LookerStudioClient:
10
27
  """
11
28
  Acts as a wrapper class to fetch Looker Studio assets, which requires
12
29
  coordinating calls between the Admin SDK API and the Looker Studio API.
30
+
31
+ If the BigQuery credentials are provided, it can also fetch the source queries
32
+ of BigQuery data sources.
13
33
  """
14
34
 
15
- def __init__(self, credentials: LookerStudioCredentials):
35
def __init__(
    self,
    credentials: LookerStudioCredentials,
    bigquery_credentials: Optional[dict] = None,
):
    """
    Creates the Admin SDK and Looker Studio API clients from the same
    credentials, plus a BigQuery client when BigQuery credentials are given.

    Args:
        credentials: Looker Studio credentials, handed to both API clients.
        bigquery_credentials: optional credentials for BigQuery; when
            missing or empty, `bigquery_client` stays `None`.
    """
    self.admin_sdk_client = AdminSDKClient(credentials)
    self.looker_studio_client = LookerStudioAPIClient(credentials)

    # BigQuery is opt-in: the client is only instantiated when credentials
    # were actually supplied
    self.bigquery_client: Optional[BigQueryClient] = (
        BigQueryClient(bigquery_credentials) if bigquery_credentials else None
    )
46
+
19
47
  def _get_assets(self) -> Iterator[dict]:
20
48
  """
21
49
  Extracts reports and data sources user by user.
@@ -26,12 +54,34 @@ class LookerStudioClient:
26
54
  email = user[USER_EMAIL_FIELD]
27
55
  yield from self.looker_studio_client.fetch_user_assets(email)
28
56
 
29
- def fetch(self, asset: LookerStudioAsset) -> Iterator[dict]:
30
- if asset == LookerStudioAsset.VIEW_ACTIVITY:
31
- yield from self.admin_sdk_client.list_view_events()
57
+ def _get_source_queries(self) -> Iterator[dict]:
58
+ """
59
+ Extracts the BigQuery jobs triggered by Looker Studio. The last job
60
+ per data source is returned.
61
+ """
62
+ if not self.bigquery_client:
63
+ return empty_iterator()
64
+
65
+ query_builder = LookerStudioQueryBuilder(
66
+ regions=self.bigquery_client.get_regions(),
67
+ datasets=self.bigquery_client.get_datasets(),
68
+ extended_regions=self.bigquery_client.get_extended_regions(),
69
+ )
70
+
71
+ queries = query_builder.job_history_queries()
72
+
73
+ for query in queries:
74
+ yield from self.bigquery_client.execute(query)
32
75
 
33
- elif asset == LookerStudioAsset.ASSETS:
76
def fetch(self, asset: LookerStudioAsset) -> Iterator[dict]:
    """
    Yields the records of the requested asset.

    ASSETS and SOURCE_QUERIES are served by the private helpers of this
    class; VIEW_ACTIVITY comes from the Admin SDK client's view events.

    Raises:
        ValueError: when the asset is none of the supported ones. As this is
            a generator, the error surfaces on first iteration.
    """
    if asset == LookerStudioAsset.ASSETS:
        yield from self._get_assets()
        return

    if asset == LookerStudioAsset.SOURCE_QUERIES:
        yield from self._get_source_queries()
        return

    if asset == LookerStudioAsset.VIEW_ACTIVITY:
        yield from self.admin_sdk_client.list_view_events()
        return

    raise ValueError(f"The asset {asset}, is not supported")
@@ -2,6 +2,11 @@ from pydantic import BaseModel, SecretStr, field_serializer
2
2
 
3
3
 
4
4
  class LookerStudioCredentials(BaseModel):
5
+ """
6
+ Looker Studio Credentials match the Service Account credentials JSON
7
+ but with an additional admin_email field.
8
+ """
9
+
5
10
  admin_email: str
6
11
  auth_provider_x509_cert_url: str
7
12
  auth_uri: str
@@ -0,0 +1,52 @@
1
/*
Query jobs that Looker Studio triggered when refreshing a BigQuery data source,
keeping only the most recent job of each data source.

Looker Studio jobs are recognised by the label `requestor = looker_studio`.
The `labels` column should also carry the `looker_studio_datasource_id` that
triggered the job; in some cases it contains a `looker_studio_report_id` too,
which links the data source to a report.
*/
WITH looker_studio_jobs AS (
    SELECT
        creation_time,
        project_id AS database_name,
        user_email,
        query AS query_text,
        referenced_tables,
        labels,
        -- rank 1 = latest job of the data source
        ROW_NUMBER() OVER (
            PARTITION BY (
                SELECT label.value
                FROM UNNEST(labels) AS label
                WHERE label.key = 'looker_studio_datasource_id'
            )
            ORDER BY creation_time DESC
        ) AS recency_rank
    FROM `{project}.region-{region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
    WHERE
        EXISTS (
            SELECT 1
            FROM UNNEST(labels) AS label
            WHERE
                label.key = 'requestor'
                AND label.value = 'looker_studio'
        )
        AND job_type = 'QUERY'
)
SELECT
    creation_time,
    database_name,
    user_email,
    query_text,
    referenced_tables,
    labels
FROM looker_studio_jobs
WHERE recency_rank = 1;
@@ -0,0 +1,84 @@
1
+ import json
2
+ import logging
3
+ from collections.abc import Iterable
4
+ from typing import Optional, Union, cast
5
+
6
+ from ...utils import (
7
+ OUTPUT_DIR,
8
+ current_timestamp,
9
+ deep_serialize,
10
+ from_env,
11
+ get_output_filename,
12
+ write_json,
13
+ write_summary,
14
+ )
15
+ from .assets import LookerStudioAsset
16
+ from .client import LookerStudioClient, LookerStudioCredentials
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ APPLICATION_CREDENTIALS = "GOOGLE_APPLICATION_CREDENTIALS"
21
+ LOOKER_STUDIO_ADMIN_EMAIL = "CASTOR_LOOKER_STUDIO_ADMIN_EMAIL"
22
+
23
+
24
def iterate_all_data(
    client: LookerStudioClient,
) -> Iterable[tuple[LookerStudioAsset, Union[list, dict]]]:
    """
    Walks through every member of `LookerStudioAsset` and yields
    `(asset, data)` pairs, where `data` is the fully materialised list of
    serialized records fetched by the client for that asset.

    The "Extracted ..." log line is emitted once the consumer resumes the
    generator after receiving the pair.
    """
    for current in LookerStudioAsset:
        label = current.name
        logger.info(f"Extracting {label} from API")
        records = list(deep_serialize(client.fetch(current)))
        yield current, records
        logger.info(f"Extracted {len(records)} {label} from API")
32
+
33
+
34
def _credentials(params: dict) -> LookerStudioCredentials:
    """
    Assembles the Looker Studio credentials: the Service Account JSON file
    content, completed with the admin email.

    Both the file path and the admin email are taken from `params`
    ("credentials" / "admin_email") when filled, otherwise from the
    environment.
    """
    path = params.get("credentials") or from_env(APPLICATION_CREDENTIALS)
    logger.info(f"Looker Studio credentials loaded from {path}")
    with open(path) as file:
        service_account = cast(dict, json.load(file))

    email_from_params = params.get("admin_email")
    admin_email = email_from_params or from_env(LOOKER_STUDIO_ADMIN_EMAIL)

    # the admin email takes precedence over any same-named key in the file
    merged = {**service_account, "admin_email": admin_email}
    return LookerStudioCredentials(**merged)
49
+
50
+
51
def _bigquery_credentials_or_none(params: dict) -> Optional[dict]:
    """
    Loads the optional GCP credentials used to access BigQuery.

    The file path comes from `params["bigquery_credentials"]` when filled,
    otherwise from the application-credentials environment variable, which
    is allowed to be missing. Returns `None` when no path could be resolved.
    """
    explicit_path = params.get("bigquery_credentials")
    path = explicit_path or from_env(
        APPLICATION_CREDENTIALS,
        allow_missing=True,
    )
    if path:
        logger.info(f"BigQuery credentials loaded from {path}")
        with open(path) as file:
            return cast(dict, json.load(file))

    return None
63
+
64
+
65
def extract_all(**kwargs) -> None:
    """
    Runs the whole Looker Studio extraction: one JSON file is written per
    asset under the output directory, followed by the summary.

    The output directory is read from the `output` keyword argument when
    filled, otherwise from the environment.
    """
    output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)

    client = LookerStudioClient(
        credentials=_credentials(kwargs),
        bigquery_credentials=_bigquery_credentials_or_none(kwargs),
    )

    # a single timestamp is shared by all the files of this run
    ts = current_timestamp()

    for asset, rows in iterate_all_data(client):
        target = get_output_filename(asset.name.lower(), output_directory, ts)
        write_json(target, rows)

    write_summary(output_directory, ts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: castor-extractor
3
- Version: 0.24.0
3
+ Version: 0.24.1
4
4
  Summary: Extract your metadata assets.
5
5
  Home-page: https://www.castordoc.com/
6
6
  License: EULA
@@ -170,7 +170,9 @@ Depending on your use case, you can also install one of the following `extras`:
170
170
 
171
171
  ```bash
172
172
  pip install castor-extractor[bigquery]
173
+ pip install castor-extractor[databricks]
173
174
  pip install castor-extractor[looker]
175
+ pip install castor-extractor[lookerstudio]
174
176
  pip install castor-extractor[metabase]
175
177
  pip install castor-extractor[mysql]
176
178
  pip install castor-extractor[powerbi]
@@ -208,6 +210,10 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:su
208
210
 
209
211
  # Changelog
210
212
 
213
+ ## 0.24.1 - 2025-03-14
214
+
215
+ * Added support for Looker Studio
216
+
211
217
  ## 0.24.0 - 2025-03-10
212
218
 
213
219
  * Remove legacy Tableau Connector
@@ -1,8 +1,8 @@
1
- CHANGELOG.md,sha256=Ud6kiuDEyG_gu0YAVlp9-oXxriLibyma98FDrlajJJE,15756
1
+ CHANGELOG.md,sha256=uuXk7pDCrTLgVqMxD2SYWDXs3OzvaBfe592stOCjBdg,15815
2
2
  Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
3
3
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
4
4
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
5
- README.md,sha256=j8oiToTvFY4eozLUJo4rs0LEqan-G3_eOSP98KFfxfM,3634
5
+ README.md,sha256=GlhxZBs3fkeyYUPjrB4_EGDY8_E0vvnftsaqtgv08vs,3718
6
6
  castor_extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  castor_extractor/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  castor_extractor/commands/extract_bigquery.py,sha256=dU4OiYO1V0n32orvZnMh1_xtFKF_VxHNXcVsH3otY-g,1269
@@ -10,6 +10,7 @@ castor_extractor/commands/extract_confluence.py,sha256=xQjC0VZdz8jFHnugqQ0fQGjzG
10
10
  castor_extractor/commands/extract_databricks.py,sha256=SVKyoa-BBUQAM6HRHf1Wdg9-tpICic2yyvXQwHcNBhA,1264
11
11
  castor_extractor/commands/extract_domo.py,sha256=jvAawUsUTHrwCn_koK6StmQr4n_b5GyvJi6uu6WS0SM,1061
12
12
  castor_extractor/commands/extract_looker.py,sha256=cySLiolLCgrREJ9d0kMrJ7P8K3efHTBTzShalWVfI3A,1214
13
+ castor_extractor/commands/extract_looker_studio.py,sha256=e79gbyTtCexRz5pg_Pp55GWkXJZWjm6NvVclmvcR0lM,916
13
14
  castor_extractor/commands/extract_metabase_api.py,sha256=NXctea4GT_1iRDitY92nV3TKSqhjEUwYSxwPJMRS3iw,786
14
15
  castor_extractor/commands/extract_metabase_db.py,sha256=tYIhTPPgj1mN-07LyWcL6e-YoGp7HCWda58-5Ukyg_I,1255
15
16
  castor_extractor/commands/extract_mode.py,sha256=Q4iO-VAKMg4zFPejhAO-foZibL5Ht3jsnhWKwJ0oqUU,823
@@ -168,17 +169,19 @@ castor_extractor/visualization/looker/extract.py,sha256=O_hzRftww3Cw1cgijL-K-8gh
168
169
  castor_extractor/visualization/looker/fields.py,sha256=7oC7p-3Wp7XHBP_FT_D1wH3kINFRnc_qGVeH1a4UNZY,623
169
170
  castor_extractor/visualization/looker/fields_test.py,sha256=7Cwq8Qky6aTZg8nCHp1gmPJtd9pGNB4QeMIRRWdHo5w,782
170
171
  castor_extractor/visualization/looker/multithreading.py,sha256=Muuh3usBLqtv3sfHoyPYJ6jJ7V5ajR6N9ZJ_F-bNc60,2608
171
- castor_extractor/visualization/looker_studio/__init__.py,sha256=p3mTWz7Yk1_m9vYohxCqwxnuE7SUYbU--TH2ezhf734,142
172
- castor_extractor/visualization/looker_studio/assets.py,sha256=_ir4L2RTmGDb1WetAm6-EZ6W4tPXxi0kNppNBlmy9QE,135
172
+ castor_extractor/visualization/looker_studio/__init__.py,sha256=GccG-GJXoNhjXFPkw-rHHZ0SXVQTFKjqkMIYHVeu3T4,175
173
+ castor_extractor/visualization/looker_studio/assets.py,sha256=lFIqr8EB6eK-Mf80R_x2qAscCyX7ZUcOcHVef1CM9B0,173
173
174
  castor_extractor/visualization/looker_studio/client/__init__.py,sha256=YkQaVDJa-7KSwdOLjtgKJMRiafbGNKC_46YVx0hYZ1Q,129
174
175
  castor_extractor/visualization/looker_studio/client/admin_sdk_client.py,sha256=hYKdU6TlWKkXx07r6HsZ4Wbxhasx8DP_jO6iDCjHjgk,3508
175
- castor_extractor/visualization/looker_studio/client/client.py,sha256=AYdR46NOdn_ITK_wPAASROW0gJjx-iA0Gi43QeuU5BU,1302
176
- castor_extractor/visualization/looker_studio/client/credentials.py,sha256=yzTaiJQ5cArTnbybUPF6fZZXbX9XQ0SBq-jVI2ECovA,521
176
+ castor_extractor/visualization/looker_studio/client/client.py,sha256=6sTfLRUhuxhkqDjC2ZBEaw6YnR6ze8-_VW2rc1u9Ksk,3191
177
+ castor_extractor/visualization/looker_studio/client/credentials.py,sha256=QImJPh8VctkrGt65UiU5hM12JI4WdCMSUFt88aiOoLw,657
177
178
  castor_extractor/visualization/looker_studio/client/endpoints.py,sha256=5eY-ffqNDdlDBOOpiF7LpjyHMrzeClJktidCr1pTDUs,669
178
179
  castor_extractor/visualization/looker_studio/client/enums.py,sha256=fHgemTaQpnwee8cw1YQVDsVnH--vTyFwT4Px8aVYYHQ,167
179
180
  castor_extractor/visualization/looker_studio/client/looker_studio_api_client.py,sha256=oySC6rsppj67RSifxwSCw4bFrz1Irx6IFJhX7tc_v1E,4087
180
181
  castor_extractor/visualization/looker_studio/client/pagination.py,sha256=9HQ3Rkdiz2VB6AvYtZ0F-WouiD0pMmdZyAmkv-3wh08,783
182
+ castor_extractor/visualization/looker_studio/client/queries/query.sql,sha256=Ub4rdrJ5WTPWKI-eVmXrNMv0Ktmti4b-93zZBr0xEB0,1426
181
183
  castor_extractor/visualization/looker_studio/client/scopes.py,sha256=824cqqgZuGq4L-rPNoHJe0ibXsxkRwB0CLG_kqw9Q0g,256
184
+ castor_extractor/visualization/looker_studio/extract.py,sha256=cHyroNZ1fKoBTvIbEebnKDrU3xpkcEgIPJy75ljCL70,2607
182
185
  castor_extractor/visualization/metabase/__init__.py,sha256=3E36cmkMyEgBB6Ot5rWk-N75i0G-7k24QTlc-Iol4pM,193
183
186
  castor_extractor/visualization/metabase/assets.py,sha256=nu3FwQBU_hdS2DBvgXAwQlEEi76QiNK2tMKEtMyctaY,2874
184
187
  castor_extractor/visualization/metabase/client/__init__.py,sha256=KBvaPMofBRV3m_sZAnKNCrJGr-Z88EbpdzEzWPQ_uBk,99
@@ -401,8 +404,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
401
404
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
402
405
  castor_extractor/warehouse/sqlserver/query.py,sha256=g0hPT-RmeGi2DyenAi3o72cTlQsLToXIFYojqc8E5fQ,533
403
406
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
404
- castor_extractor-0.24.0.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
405
- castor_extractor-0.24.0.dist-info/METADATA,sha256=HWTKSDx_akRg3FK_dMP5mRxlLg9Oc55uGUYgt2NmYaQ,22828
406
- castor_extractor-0.24.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
407
- castor_extractor-0.24.0.dist-info/entry_points.txt,sha256=7aVSxc-_2dicp28Ow-S4y0p4wGoTm9zGmVptMvfLdw8,1649
408
- castor_extractor-0.24.0.dist-info/RECORD,,
407
+ castor_extractor-0.24.1.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
408
+ castor_extractor-0.24.1.dist-info/METADATA,sha256=O-v3GkuQVqXbbr5DYt0Tm2uodPoXJd4f6ayR93ywA3M,22971
409
+ castor_extractor-0.24.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
410
+ castor_extractor-0.24.1.dist-info/entry_points.txt,sha256=FQNShG4w4nRO95_bZnagh7FQ2oiZ-40bdt8ZdTW1-uI,1731
411
+ castor_extractor-0.24.1.dist-info/RECORD,,
@@ -4,6 +4,7 @@ castor-extract-confluence=castor_extractor.commands.extract_confluence:main
4
4
  castor-extract-databricks=castor_extractor.commands.extract_databricks:main
5
5
  castor-extract-domo=castor_extractor.commands.extract_domo:main
6
6
  castor-extract-looker=castor_extractor.commands.extract_looker:main
7
+ castor-extract-looker-studio=castor_extractor.commands.extract_looker_studio:main
7
8
  castor-extract-metabase-api=castor_extractor.commands.extract_metabase_api:main
8
9
  castor-extract-metabase-db=castor_extractor.commands.extract_metabase_db:main
9
10
  castor-extract-mode=castor_extractor.commands.extract_mode:main