castor-extractor 0.20.4__py3-none-any.whl → 0.20.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

CHANGELOG.md CHANGED
@@ -1,6 +1,10 @@
1
1
 
2
2
  # Changelog
3
3
 
4
+ ## 0.20.5 - 2024-10-09
5
+
6
+ * Redshift: enable extraction from a Redshift Serverless instance
7
+
4
8
  ## 0.20.4 - 2024-10-09
5
9
 
6
10
  * Salesforce warehouse: `Labels` instead of `api_names` for columns
@@ -23,6 +23,11 @@ def main():
23
23
  action="store_true",
24
24
  help="Skips files already extracted instead of replacing them",
25
25
  )
26
+ parser.add_argument(
27
+ "--serverless",
28
+ action="store_true",
29
+ help="Enables extraction for Redshift Serverless",
30
+ )
26
31
  parser.set_defaults(skip_existing=False)
27
32
 
28
33
  args = parser.parse_args()
@@ -34,5 +39,6 @@ def main():
34
39
  user=args.user,
35
40
  password=args.password,
36
41
  output_directory=args.output,
42
+ serverless=args.serverless,
37
43
  skip_existing=args.skip_existing,
38
44
  )
@@ -34,6 +34,7 @@ REDSHIFT_PASSWORD = "CASTOR_REDSHIFT_PASSWORD" # noqa: S105
34
34
  REDSHIFT_HOST = "CASTOR_REDSHIFT_HOST"
35
35
  REDSHIFT_PORT = "CASTOR_REDSHIFT_PORT"
36
36
  REDSHIFT_DATABASE = "CASTOR_REDSHIFT_DATABASE"
37
+ REDSHIFT_SERVERLESS = "CASTOR_REDSHIFT_SERVERLESS"
37
38
 
38
39
 
39
40
  def _credentials(params: dict) -> dict:
@@ -48,6 +49,14 @@ def _credentials(params: dict) -> dict:
48
49
  }
49
50
 
50
51
 
52
+ def _query_builder(params: dict) -> RedshiftQueryBuilder:
53
+ env_parameter = from_env(REDSHIFT_SERVERLESS, allow_missing=True)
54
+ from_env_ = str(env_parameter).lower() == "true"
55
+ from_params_ = params.get("serverless", False)
56
+ is_serverless = from_params_ or from_env_
57
+ return RedshiftQueryBuilder(is_serverless=is_serverless)
58
+
59
+
51
60
  def extract_all(**kwargs) -> None:
52
61
  """
53
62
  Extract all assets from Redshift and store the results in CSV files
@@ -56,7 +65,7 @@ def extract_all(**kwargs) -> None:
56
65
 
57
66
  client = RedshiftClient(credentials=_credentials(kwargs))
58
67
 
59
- query_builder = RedshiftQueryBuilder()
68
+ query_builder = _query_builder(kwargs)
60
69
 
61
70
  storage = LocalStorage(directory=output_directory)
62
71
 
@@ -0,0 +1,26 @@
1
+ import pytest
2
+
3
+ from .extract import (
4
+ REDSHIFT_SERVERLESS,
5
+ _query_builder,
6
+ )
7
+
8
+
9
+ @pytest.mark.parametrize(
10
+ "serverless_param,env_param,expected",
11
+ [
12
+ (True, "False", True),
13
+ (False, "True", True),
14
+ (None, "TRUE", True),
15
+ (None, "TrUe", True),
16
+ (None, "FAlSE", False),
17
+ (None, "False", False),
18
+ (None, None, False),
19
+ (True, None, True),
20
+ ],
21
+ )
22
+ def test__query_builder(serverless_param, env_param, expected, monkeypatch):
23
+ params = {"serverless": serverless_param}
24
+ monkeypatch.setenv(REDSHIFT_SERVERLESS, env_param)
25
+
26
+ assert _query_builder(params).is_serverless == expected
@@ -0,0 +1,69 @@
1
+ WITH parameters AS (
2
+ SELECT
3
+ :day AS day_start,
4
+ :hour_min AS hour_min,
5
+ :hour_max AS hour_max
6
+ ),
7
+
8
+ queries_deduplicated AS (
9
+ SELECT DISTINCT q.query_id
10
+ FROM SYS_QUERY_HISTORY AS q
11
+ CROSS JOIN parameters AS p
12
+ WHERE TRUE
13
+ AND DATE(q.start_time) = p.day_start
14
+ AND EXTRACT('hour' FROM q.start_time) BETWEEN p.hour_min AND p.hour_max
15
+ ),
16
+
17
+ query AS (
18
+ SELECT
19
+ q.query_id,
20
+ qt.text,
21
+ qt.sequence,
22
+ COUNT(*) OVER(PARTITION BY q.query_id) AS sequence_count
23
+ FROM queries_deduplicated AS q
24
+ INNER JOIN SYS_QUERY_TEXT AS qt ON q.query_id = qt.query_id
25
+ ),
26
+
27
+ raw_query_text AS
28
+ (
29
+ SELECT
30
+ q.query_id,
31
+ LISTAGG(q.text, '') WITHIN GROUP (ORDER BY q.sequence) AS agg_text
32
+ FROM query AS q
33
+ WHERE TRUE
34
+ -- LISTAGG raises an error when total length >= 65535
35
+ -- each query text contains 4000 char max
36
+ AND q.sequence_count < (65535 / 4000)
37
+ GROUP BY q.query_id
38
+ ),
39
+
40
+ query_text AS (
41
+ SELECT
42
+ query_id,
43
+ CASE
44
+ WHEN agg_text ILIKE 'INSERT INTO%%'
45
+ THEN REGEXP_REPLACE(agg_text, 'VALUES (.*)', 'DEFAULT VALUES')
46
+ ELSE agg_text
47
+ END AS agg_text
48
+ FROM raw_query_text
49
+ )
50
+ SELECT
51
+ q.query_id::VARCHAR(256) AS query_id,
52
+ qt.agg_text::VARCHAR(60000) AS query_text,
53
+ q.database_name AS database_id,
54
+ q.database_name AS database_name,
55
+ q.session_id AS process_id,
56
+ 0 as aborted,
57
+ q.start_time AS start_time,
58
+ q.end_time AS end_time,
59
+ q.user_id AS user_id,
60
+ q.query_label,
61
+ u.usename AS user_name
62
+ FROM SYS_QUERY_HISTORY AS q
63
+ JOIN query_text AS qt ON q.query_id = qt.query_id
64
+ JOIN pg_catalog.pg_user AS u ON u.usesysid = q.user_id
65
+ CROSS JOIN parameters AS p
66
+ WHERE TRUE
67
+ AND DATE(q.start_time) = p.day_start
68
+ AND EXTRACT('hour' FROM q.start_time) BETWEEN p.hour_min AND p.hour_max
69
+ AND q.status = 'success'
@@ -15,10 +15,21 @@ class RedshiftQueryBuilder(AbstractQueryBuilder):
15
15
 
16
16
  def __init__(
17
17
  self,
18
+ is_serverless: bool = False,
18
19
  time_filter: Optional[TimeFilter] = None,
19
20
  ):
20
21
  super().__init__(time_filter=time_filter)
22
+ self.is_serverless = is_serverless
23
+
24
+ def build_query_serverless(self) -> ExtractionQuery:
25
+ """To get the query history in Redshift Serverless, we cannot use STL tables."""
26
+ statement = self._load_from_file("query_serverless.sql")
27
+ params = self._time_filter.to_dict()
28
+ return ExtractionQuery(statement, params)
21
29
 
22
30
  def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
23
- query = self.build_default(asset)
31
+ if asset == WarehouseAsset.QUERY and self.is_serverless:
32
+ query = self.build_query_serverless()
33
+ else:
34
+ query = self.build_default(asset)
24
35
  return [query]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: castor-extractor
3
- Version: 0.20.4
3
+ Version: 0.20.5
4
4
  Summary: Extract your metadata assets.
5
5
  Home-page: https://www.castordoc.com/
6
6
  License: EULA
@@ -208,6 +208,10 @@ For any questions or bug report, contact us at [support@castordoc.com](mailto:su
208
208
 
209
209
  # Changelog
210
210
 
211
+ ## 0.20.5 - 2024-10-09
212
+
213
+ * Redshift: enable extraction from a Redshift Serverless instance
214
+
211
215
  ## 0.20.4 - 2024-10-09
212
216
 
213
217
  * Salesforce warehouse: `Labels` instead of `api_names` for columns
@@ -1,4 +1,4 @@
1
- CHANGELOG.md,sha256=CzVaQbFAS2hlZE2ak7DTYHWBNjMaC59e8UK7Q9p10tw,13905
1
+ CHANGELOG.md,sha256=gqprCxUMpvdGZdxp3AxdsRKA3JK-Q9NM72m839G5CeQ,13996
2
2
  Dockerfile,sha256=HcX5z8OpeSvkScQsN-Y7CNMUig_UB6vTMDl7uqzuLGE,303
3
3
  DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
4
4
  LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -17,7 +17,7 @@ castor_extractor/commands/extract_notion.py,sha256=uaxcF3_bT7D_-JxnIW0F7VVDphI_Z
17
17
  castor_extractor/commands/extract_postgres.py,sha256=pX0RnCPi4nw6QQ6wiAuZ_Xt3ZbDuMUG9aQKuqFgJtAU,1154
18
18
  castor_extractor/commands/extract_powerbi.py,sha256=f0G5w61KXExJ6Sw39_mJIwqQNpLorE5-LKmZXlUqvKI,783
19
19
  castor_extractor/commands/extract_qlik.py,sha256=VBe_xFKh_nR0QSFFIncAaC8yDqBeMa6VunBAga7AeGg,891
20
- castor_extractor/commands/extract_redshift.py,sha256=bdLp7d7ImZoKCkWc3f3NXF1imIzMVT43_KPI-x4UVac,1155
20
+ castor_extractor/commands/extract_redshift.py,sha256=zRBg2D_ft4GLdPSdmetRcgQVAA80DXtdRSYsQhAWIik,1334
21
21
  castor_extractor/commands/extract_salesforce.py,sha256=3j3YTmMkPAwocR-B1ozJQai0UIZPtpmAyWj-hHvdWn4,1226
22
22
  castor_extractor/commands/extract_salesforce_reporting.py,sha256=FdANTNiLkIPdm80XMYxWReHjdycLsIa61pyeCD-sUDk,962
23
23
  castor_extractor/commands/extract_sigma.py,sha256=sxewHcZ1Doq35V2qnpX_zCKKXkrb1_9bYjUMg7BOW-k,643
@@ -362,18 +362,20 @@ castor_extractor/warehouse/postgres/query.py,sha256=5QmI79BP_EjqxeABNg56rxuM9Xuu
362
362
  castor_extractor/warehouse/redshift/__init__.py,sha256=CC82SejYDlwYhZhhn40ln-oTsRx7AJ1Km61cxPkymjE,125
363
363
  castor_extractor/warehouse/redshift/client.py,sha256=My7003HGBhTpS6X5NgYcKwntR7h45scLaGr-LSY0tIc,2172
364
364
  castor_extractor/warehouse/redshift/client_test.py,sha256=74lZfna71qs80EKAuitJ8_ZjAGtpYHf5tChySinVPoQ,1023
365
- castor_extractor/warehouse/redshift/extract.py,sha256=XnAnBSapzXYUWVQKMhVaLlloA-uXEqseOhSt9flVsdw,2331
365
+ castor_extractor/warehouse/redshift/extract.py,sha256=pblUQ2XafVTpyHrDLrmwFKy55mUNF03dZvgPweihYUc,2723
366
+ castor_extractor/warehouse/redshift/extract_test.py,sha256=-8eWOsFEv4DFvBmalaE_TzQD6YdgwnGRPCkKsycJuxg,653
366
367
  castor_extractor/warehouse/redshift/queries/.sqlfluff,sha256=W4pFQiY8KMtXwn3WguYQJA8cj78VR7K-iokPoZoy5aM,30
367
368
  castor_extractor/warehouse/redshift/queries/column.sql,sha256=ZXdurmaJRD2fejDksU5eh37Q4srmnVrEjSzsrtg_il8,6840
368
369
  castor_extractor/warehouse/redshift/queries/database.sql,sha256=_C0knW159YDfReGuWLjIdvxHzefo1Xg2xw2dJKJzNk8,299
369
370
  castor_extractor/warehouse/redshift/queries/group.sql,sha256=8p0wlqllnwOTiAgiV237DvFYHGOEcYwaHdyqVQg3F6E,101
370
371
  castor_extractor/warehouse/redshift/queries/query.sql,sha256=yZNGnUdebvvDx0J0KMSJ2hNgkK4gPduyOfPM_7-DIfo,3465
372
+ castor_extractor/warehouse/redshift/queries/query_serverless.sql,sha256=QlYYFLJ2gInVczuXDxTGColM3-_zLSpPD0tBuLVFMyQ,1925
371
373
  castor_extractor/warehouse/redshift/queries/schema.sql,sha256=Mf6nooi2w2PhGxM2_kDAf3oQ8QnR-hpT5Y0AmUzghGg,585
372
374
  castor_extractor/warehouse/redshift/queries/table.sql,sha256=y8CGOwPHH_Mr8g1Zvuz2U5ldL8zuPm5v3M5RPZqIhsE,2645
373
375
  castor_extractor/warehouse/redshift/queries/table_freshness.sql,sha256=l61_ysmTEtuMwK9RmYmD5cu0HmD1RXwTEhX0ytBeyxg,726
374
376
  castor_extractor/warehouse/redshift/queries/user.sql,sha256=sEXveJAuNvZacvpI6WfwsX6VavoMb2VqYA32f6Dt-_Y,170
375
377
  castor_extractor/warehouse/redshift/queries/view_ddl.sql,sha256=Pkyh_QT6d4rhTeyiVcqw6O8CRl7NEhk2p7eM5YIn5kg,719
376
- castor_extractor/warehouse/redshift/query.py,sha256=0C81rkt2cpkWrJIxxwALDyqr-49vlqQM04y_N6wwStc,540
378
+ castor_extractor/warehouse/redshift/query.py,sha256=F2MiFqPRNGfBrCtkXNRs28Q_i9DfIEKh93yDUVb8Yjw,1060
377
379
  castor_extractor/warehouse/salesforce/__init__.py,sha256=NR4aNea5jeE1xYqeZ_29deeN84CkN0_D_Z7CLQdJvFY,137
378
380
  castor_extractor/warehouse/salesforce/client.py,sha256=-9WHcQwEMrpGRQ9CN-bsRSR2Tnx9d-f_FtV4ntsf71w,3287
379
381
  castor_extractor/warehouse/salesforce/constants.py,sha256=GusduVBCPvwpk_Im6F3bDvXeNQ7hRnCMdIAjIg65RnE,52
@@ -413,8 +415,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
413
415
  castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
414
416
  castor_extractor/warehouse/sqlserver/query.py,sha256=j_d5-HMnzBouwGfywVZMRSSwbXzPvzDWlFCZmvxcoGQ,539
415
417
  castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
416
- castor_extractor-0.20.4.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
417
- castor_extractor-0.20.4.dist-info/METADATA,sha256=YcFx5O-gccq_JevTWl9xfeE5LGf5baiUKHfPrG1QX28,21123
418
- castor_extractor-0.20.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
419
- castor_extractor-0.20.4.dist-info/entry_points.txt,sha256=IVGy_oM8VjzADMAxzmiNJTYYidTCsI98MpO_mkXjkqE,1573
420
- castor_extractor-0.20.4.dist-info/RECORD,,
418
+ castor_extractor-0.20.5.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
419
+ castor_extractor-0.20.5.dist-info/METADATA,sha256=wmQH2GPtnBGHq8a5CdjKZ6yPMNIrS4gkYrTpbV0T9yg,21214
420
+ castor_extractor-0.20.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
421
+ castor_extractor-0.20.5.dist-info/entry_points.txt,sha256=IVGy_oM8VjzADMAxzmiNJTYYidTCsI98MpO_mkXjkqE,1573
422
+ castor_extractor-0.20.5.dist-info/RECORD,,