perspective-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perspective/__init__.py +1 -0
- perspective/config.py +240 -0
- perspective/exceptions.py +15 -0
- perspective/ingest/dbt.py +150 -0
- perspective/ingest/ingest.py +164 -0
- perspective/ingest/postgres.py +388 -0
- perspective/ingest/sources/bi/powerbi/extract.py +184 -0
- perspective/ingest/sources/bi/powerbi/models.py +137 -0
- perspective/ingest/sources/bi/powerbi/pipeline.py +29 -0
- perspective/ingest/sources/bi/powerbi/transform.py +478 -0
- perspective/ingest/sources/bi/qlik_sense/extract.py +297 -0
- perspective/ingest/sources/bi/qlik_sense/models.py +22 -0
- perspective/ingest/sources/bi/qlik_sense/pipeline.py +19 -0
- perspective/ingest/sources/bi/qlik_sense/transform.py +76 -0
- perspective/ingest/sources/database/sap/extract.py +253 -0
- perspective/ingest/sources/database/sap/pipeline.py +23 -0
- perspective/ingest/sources/database/sap/transform.py +85 -0
- perspective/main.py +74 -0
- perspective/models/configs.py +422 -0
- perspective/models/dashboards.py +44 -0
- perspective/models/databases.py +26 -0
- perspective/utils/__init__.py +3 -0
- perspective/utils/options.py +77 -0
- perspective/utils/utils.py +274 -0
- perspective_cli-0.1.0.dist-info/METADATA +49 -0
- perspective_cli-0.1.0.dist-info/RECORD +29 -0
- perspective_cli-0.1.0.dist-info/WHEEL +5 -0
- perspective_cli-0.1.0.dist-info/entry_points.txt +2 -0
- perspective_cli-0.1.0.dist-info/top_level.txt +1 -0

perspective/ingest/sources/bi/qlik_sense/extract.py
@@ -0,0 +1,297 @@
"""Playground for Qlik Sense APIs exploration.

We connect to two APIs:
- Qlik Sense Repository Service (QRS) API
- Qlik Engine JSON API

We connect using the default Virtual Proxy Service (built into the Qlik Sense
Enterprise default configuration, so no server-side configuration is required).

For auth, for the time being we use the Windows login method (NTLM), combined with the
"header" method of authenticating to these APIs.

In the case of the QRS API, we use the requests library to make HTTP requests, and use
the requests_ntlm library to handle NTLM authentication.

For the Qlik Engine JSON API, we use the websocket library to create a WebSocket
connection. To authenticate, we use a Qlik session ID that we get from calling a
QRS API endpoint (the endpoint doesn't matter; we only care that the server returns
a session ID in its response) using the method described above.
"""

from collections.abc import Generator
import json
from pathlib import Path
import secrets
import ssl
import string
from typing import Any

import dlt
from dlt.extract.resource import DltResource
from loguru import logger
import requests
from requests_ntlm import HttpNtlmAuth
import websocket


# Disable SSL warnings (optional but recommended for self-signed certs).
requests.packages.urllib3.disable_warnings()


def get_qrs_session(ntlm_username: str, ntlm_password: str) -> requests.Session:
    """Create and configure a requests Session for the QRS API."""
    session = requests.Session()
    # Probably a typo of xsrf; anyway, this is a random string of 16 alphanumeric chars.
    alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    xrfkey = "".join(secrets.choice(alphabet) for _ in range(16))
    headers = {
        "x-qlik-xrfkey": xrfkey,
        "User-Agent": "Windows",
    }
    session.auth = HttpNtlmAuth(ntlm_username, ntlm_password)
    session.headers = headers
    session.params = {"xrfkey": xrfkey}
    session.verify = False
    return session


def get_qlik_session_id(
    qrs_api_base_url: str, ntlm_username: str, ntlm_password: str
) -> str:
    """Retrieve a Qlik session ID from the QRS API.

    Raises:
        ValueError: If the session ID could not be retrieved.

    Returns:
        str: The Qlik session ID.
    """
    session = get_qrs_session(ntlm_username=ntlm_username, ntlm_password=ntlm_password)

    logger.debug("Retrieving Qlik session ID...")

    full_url = qrs_api_base_url + "/about"
    response = session.get(full_url)
    response.raise_for_status()
    session_id = session.cookies.get("X-Qlik-Session")

    if not session_id:
        msg = "Could not retrieve Qlik session ID. Perhaps Qlik API's logic changed?"
        raise ValueError(msg)

    logger.debug(f"Retrieved Qlik session ID: {session_id}.")
    return session_id


def get_socket(engine_api_url: str, session_id: str) -> websocket.WebSocket:
    """Create a WebSocket connection to the Qlik Engine JSON API.

    Args:
        engine_api_url (str): The URL of the Qlik Engine JSON API.
        session_id (str): The ID of the Qlik session to use.

    Returns:
        websocket.WebSocket: An authenticated socket connection.
    """
    # Pass the Qlik session ID (obtained from the QRS API) as a cookie.
    # This way, we can (indirectly) use header auth in the Qlik Engine JSON API.
    headers = {"Cookie": f"X-Qlik-Session={session_id}"}

    # Connect to the Qlik Engine JSON API.
    logger.info("Creating a socket...")

    socket = websocket.WebSocket(sslopt={"cert_reqs": ssl.CERT_NONE})
    socket.connect(engine_api_url, header=headers)
    on_authentication_msg = json.loads(socket.recv())
    logger.debug(f"On authentication message: {on_authentication_msg}")
    if on_authentication_msg["params"]["mustAuthenticate"]:
        msg = "Could not authenticate to Qlik Engine JSON API."
        msg += " Please ensure you're authenticated and provide a valid session ID."
        raise ValueError(msg)

    on_connected_msg = socket.recv()
    logger.debug(f"On connected message: {on_connected_msg}")

    return socket


def get(
    request: dict[str, Any],
    engine_api_url: str,
    session_id: str | None = None,
    socket: websocket.WebSocket | None = None,
) -> dict[str, Any]:
    """Retrieve a response from the Qlik Engine JSON API using WebSocket.

    Args:
        request (dict[str, Any]): The request body.
        engine_api_url (str): The URL of the Qlik Engine JSON API.
        session_id (str | None, optional): The ID of the Qlik session to use. Defaults
            to None.
        socket (websocket.WebSocket | None, optional): The socket to use. Defaults to
            None.

    Raises:
        ValueError: If neither a session ID nor a socket is provided, or if the API
            response contains an error.

    Returns:
        dict[str, Any]: The response from the Qlik Engine JSON API.
    """
    if not session_id and not socket:
        msg = "Either `socket` or `session_id` must be provided."
        raise ValueError(msg)

    is_socket_externally_managed = bool(socket)
    if not is_socket_externally_managed:
        socket = get_socket(engine_api_url, session_id=session_id)

    logger.info(f"Calling '{request['method']}' method...")
    socket.send(json.dumps(request))
    response = json.loads(socket.recv())

    if not is_socket_externally_managed:
        socket.close()

    if machine_readable_error := response.get("error"):
        error_code = machine_readable_error["code"]
        error_message_short = machine_readable_error["message"]
        error_message_long = machine_readable_error["parameter"]
        human_readable_error = (
            f"Error {error_code} ('{error_message_short}'). " + error_message_long + "."
        )
        raise ValueError(human_readable_error)

    return response["result"]


@dlt.source
def qlik_sense(
    qrs_api_base_url: str = dlt.secrets.value,
    engine_api_url: str = dlt.secrets.value,
    ntlm_username: str = dlt.secrets.value,
    ntlm_password: str = dlt.secrets.value,
) -> list[DltResource]:
    """The Qlik Sense metadata source."""
    # We use a single Qlik session for all requests.
    qlik_session_id = get_qlik_session_id(
        qrs_api_base_url, ntlm_username=ntlm_username, ntlm_password=ntlm_password
    )

    @dlt.resource(primary_key="qDocId", write_disposition="merge")
    def apps_engine(
        modified_at: dlt.sources.incremental = dlt.sources.incremental(  # noqa: B008
            "modifiedDate", initial_value="2024-01-01T00:00:00Z"
        ),
    ) -> DltResource:
        """Get metadata about all apps in Qlik Sense from the Engine JSON API."""
        query = {
            "jsonrpc": "2.0",
            # "id": 1,
            "method": "GetDocList",
            "handle": -1,
            "params": [],
        }
        apps_info_nested = get(
            query, session_id=qlik_session_id, engine_api_url=engine_api_url
        )
        # Unnest and return only the modified apps.
        yield from [
            {
                "qDocId": app["qDocId"],
                "qDocName": app["qDocName"],
                "description": app["qMeta"]["description"],
                "createdDate": app["qMeta"]["createdDate"],
                "modifiedDate": app["qMeta"]["modifiedDate"],
                "stream": {
                    "id": app["qMeta"]["stream"]["id"]
                    if app["qMeta"].get("stream")
                    else None,
                    "name": app["qMeta"]["stream"]["name"]
                    if app["qMeta"].get("stream")
                    else None,
                },
            }
            for app in apps_info_nested["qDocList"]
            if app["qMeta"]["modifiedDate"] > modified_at.start_value
        ]

    @dlt.transformer(data_from=apps_engine, primary_key="id", write_disposition="merge")
    def app_details_qrs(app: dict[str, Any]) -> Generator[dict[str, Any], None, None]:
        app_id = app["qDocId"]

        logger.debug(f"Retrieving app {app_id} details from QRS API...")

        session = get_qrs_session(
            ntlm_username=ntlm_username, ntlm_password=ntlm_password
        )
        full_url = qrs_api_base_url + f"/app/{app_id}"
        response = session.get(full_url)
        response.raise_for_status()
        yield response.json()

    @dlt.transformer(data_from=apps_engine, primary_key="id", write_disposition="merge")
    def app_details_engine(
        app: dict[str, Any],
    ) -> Generator[dict[str, Any], None, None]:
        app_id = app["qDocId"]
        # These two requests are connected, so we need to perform them on the same
        # socket.
        socket = get_socket(engine_api_url, session_id=qlik_session_id)
        open_app_query = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": "OpenDoc",
            "handle": -1,
            "params": {"qDocName": app_id},
        }
        lineage_query = {
            "jsonrpc": "2.0",
            "id": 2,
            "method": "GetLineage",
            "handle": 1,
            "params": {},
        }

        logger.debug(f"Retrieving app {app_id} details from Qlik Engine JSON API...")

        try:
            _ = get(open_app_query, socket=socket, engine_api_url=engine_api_url)
            lineage = get(lineage_query, socket=socket, engine_api_url=engine_api_url)[
                "qLineage"
            ]
        except Exception:
            msg = f"Failed retrieving lineage for app: {app['qDocName']} (ID: {app_id})."
            logger.exception(msg)
            return
        finally:
            socket.close()

        # NOTE: We enrich the response with the app ID so that it can be handled
        # properly.
        yield {"id": app_id, "lineage": lineage}

    return [apps_engine, app_details_qrs, app_details_engine]


if __name__ == "__main__":
    # source = qlik_sense().add_limit(2)
    source = qlik_sense()
    # for apps_engine, app_details_qrs, app_details_engine in source:
    #     print(json.dumps(app_details_qrs, indent=4))
    #     print("-" * 30)
    #     print(json.dumps(app_details_engine, indent=4))
    app_details_qrs = list(source.with_resources("app_details_qrs"))
    app_details_engine = list(source.with_resources("app_details_engine"))
    logger.info(f"{len(app_details_qrs)} apps in QRS API.")
    logger.info(f"{len(app_details_engine)} apps in Engine JSON API.")

    with Path("app_details_qrs.json").open("w", encoding="utf-8") as f:
        json.dump(app_details_qrs, f, indent=4)

    with Path("app_details_engine.json").open("w", encoding="utf-8") as f:
        json.dump(app_details_engine, f, indent=4)
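
Note: `get()` above is generic over Engine JSON API methods. A minimal sketch of
calling it standalone, assuming hypothetical URLs and credentials (the
`EngineVersion` global method is used here purely for illustration):

    session_id = get_qlik_session_id(
        "https://qlik.example.com/qrs",  # hypothetical QRS base URL
        ntlm_username="DOMAIN\\user",
        ntlm_password="...",  # placeholder
    )
    version_query = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "EngineVersion",
        "handle": -1,
        "params": [],
    }
    result = get(
        version_query,
        engine_api_url="wss://qlik.example.com/app/engineData",  # hypothetical
        session_id=session_id,
    )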

perspective/ingest/sources/bi/qlik_sense/models.py
@@ -0,0 +1,22 @@
"""Pydantic models for Qlik Sense APIs responses."""

from pydantic import BaseModel, EmailStr


class AppDetailsQRS(BaseModel):
    id: str
    name: str
    description: str | None
    owner: EmailStr | None
    created_at: str
    updated_at: str


class Table(BaseModel):
    qDiscriminator: str
    qStatement: str | None = None


class AppDetailsEngine(BaseModel):
    app_id: str
    lineage: list[Table]
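
Note: a hedged example of how these models would validate a joined record (all
values made up; the `lineage` entries mirror the `qLineage` items returned by
`GetLineage` in extract.py):

    sample = {
        "app_id": "0e48dc52-0000-0000-0000-000000000000",  # hypothetical GUID
        "lineage": [{"qDiscriminator": "LIB://Sales/orders.qvd", "qStatement": None}],
    }
    details = AppDetailsEngine(**sample)  # raises pydantic.ValidationError if invalid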

perspective/ingest/sources/bi/qlik_sense/pipeline.py
@@ -0,0 +1,19 @@
"""dlt pipeline to load Qlik Sense metadata into DuckDB."""

import dlt

from perspective.ingest.sources.bi.qlik_sense.extract import qlik_sense


pipe = dlt.pipeline(
    pipeline_name="qlik_to_duckdb",
    destination=dlt.destinations.duckdb("db.duckdb"),
    dataset_name="qlik_sense",
)

if __name__ == "__main__":
    from loguru import logger

    # load_package = pipe.run(qlik_sense().add_limit(5), refresh="drop_data")  # For testing.
    load_package = pipe.run(qlik_sense().add_limit(5))
    logger.info(load_package)
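
Note: after a run, the loaded data can be inspected directly in DuckDB. A sketch,
assuming dlt's default snake_case normalization of the resource's field names:

    import duckdb

    con = duckdb.connect("db.duckdb")
    rows = con.sql(
        "SELECT q_doc_name, modified_date FROM qlik_sense.apps_engine"
    ).fetchall()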

perspective/ingest/sources/bi/qlik_sense/transform.py
@@ -0,0 +1,76 @@
"""Transform Qlik Sense metadata into Perspective DashboardManifest format."""

import json
from pathlib import Path

from perspective.ingest.sources.bi.qlik_sense.models import (
    AppDetailsEngine,
    AppDetailsQRS,
)
from perspective.models.dashboards import (
    Dashboard,
    DashboardManifest,
    DashboardSchemaMetadata,
    DataModel,
)


def transform(
    app_details_qrs: list[AppDetailsQRS], app_details_engine: list[AppDetailsEngine]
) -> DashboardManifest:
    """Join app metadata from the two sources: QRS API and Engine JSON API.

    Args:
        app_details_qrs (list[AppDetailsQRS]): List of app details from QRS API.
        app_details_engine (list[AppDetailsEngine]): List of app details from Engine
            JSON API.
    """
    # Extract tables from the Qlik Sense metadata.
    # tables = extract_tables(workspace_info)
    # reports = extract_reports(workspace_info, tables=tables)

    # return DashboardManifest(
    #     metadata=DashboardSchemaMetadata(schema="dashboard", version=1),
    #     payload=reports,
    # )
    apps = []
    for app in app_details_qrs:
        app_parsed = {
            "external_id": app["id"],
            "url": f"{app['resourceId']}/hub/{app['id']}",
            "type": "qliksense",
            "name": app["name"],
            "workspace": (app.get("stream") or {}).get("name", "My Work"),
            "created_at": app.get("createdDate"),
            "modified_at": app.get("modifiedDate"),
            "owners": [
                {
                    "user_id": (app.get("owner") or {}).get("userId", ""),
                    "username": (app.get("owner") or {}).get("userDirectory", ""),
                    "name": (app.get("owner") or {}).get("name", ""),
                }
            ],
            "parent_models": [],
        }

        apps.append(app_parsed)

    return DashboardManifest(
        metadata=DashboardSchemaMetadata(schema="dashboard", version=1),
        payload=apps,
    )


if __name__ == "__main__":
    from loguru import logger

    with Path("app_details_qrs.json").open(encoding="utf-8") as f:
        app_details_qrs = json.load(f)

    with Path("app_details_engine.json").open(encoding="utf-8") as f:
        app_details_engine = json.load(f)

    dashboard_manifest = transform(
        app_details_qrs=app_details_qrs, app_details_engine=app_details_engine
    )
    logger.info(dashboard_manifest)
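
Note: a quick smoke test of `transform` with a single hand-written QRS record
(all values hypothetical):

    sample_qrs = [
        {
            "id": "0e48dc52-0000-0000-0000-000000000000",
            "resourceId": "https://qlik.example.com",
            "name": "Sales Overview",
            "stream": {"name": "Finance"},
            "createdDate": "2024-01-01T00:00:00Z",
            "modifiedDate": "2024-06-01T00:00:00Z",
            "owner": {"userId": "jdoe", "userDirectory": "CORP", "name": "Jane Doe"},
        }
    ]
    manifest = transform(app_details_qrs=sample_qrs, app_details_engine=[])
    assert manifest.payload[0]["external_id"] == sample_qrs[0]["id"]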

perspective/ingest/sources/database/sap/extract.py
@@ -0,0 +1,253 @@
from itertools import batched
import string
import textwrap

import dlt
from loguru import logger


try:
    import pyrfc
    from pyrfc._exception import (
        ABAPApplicationError,
        ABAPRuntimeError,
        CommunicationError,
    )
except ModuleNotFoundError as e:
    msg = "The 'pyrfc' package is required to use the SAPRFC source."
    raise ImportError(msg) from e


@dlt.source(name="sap")
def sap(
    ashost: str = dlt.secrets.value,
    sysnr: str = dlt.secrets.value,
    username: str = dlt.secrets.value,
    passwd: str = dlt.secrets.value,
):
    """Query SAP with SQL using the RFC protocol.

    Use RFC_READ_TABLE to read SAP table metadata.
    """
    delimiter = "\t"

    # Test the connection.
    con = pyrfc.Connection(
        ashost=ashost,
        sysnr=sysnr,
        user=username,
        passwd=passwd,
    )
    logger.info("Checking the connection...")
    try:
        con.ping()
        logger.info("Connection successful.")
    except Exception:
        logger.exception("Connection to SAP failed.")
        raise
    finally:
        con.close()

    # By convention, custom tables use a "Z" or "Y" prefix in their name.
    custom_table_prefixes = ["Z", "Y"]
    standard_table_prefixes = [
        char for char in string.ascii_uppercase if char not in custom_table_prefixes
    ]

    def get_response(
        params,
        func: str = "RFC_READ_TABLE",
        data_key: str = "DATA",
        fallback_data_key: str | None = None,
        record_key: str = "WA",
        fallback_record_key: str | None = None,
    ):
        """Call the RFC function with the given parameters."""
        con = pyrfc.Connection(
            ashost=ashost,
            sysnr=sysnr,
            user=username,
            passwd=passwd,
        )
        try:
            response = con.call(func, **params)
        except (ABAPApplicationError, ABAPRuntimeError, CommunicationError) as e:
            if e.key == "DATA_BUFFER_EXCEEDED":
                msg = "Character limit per row exceeded. Please select fewer columns."
            elif e.key == "TSV_TNEW_PAGE_ALLOC_FAILED":
                msg = "Memory allocation failed; try a smaller batch size."
            else:
                msg = f"Error while calling {func} with params:\n{params}."
            logger.exception(msg)
            raise
        finally:
            con.close()

        # Process the response into records.
        data_raw = response.get(data_key) or response.get(fallback_data_key)
        data = [
            [
                value.strip()
                for value in row.get(record_key, row.get(fallback_record_key)).split(
                    delimiter
                )
            ]
            for row in data_raw
        ]
        logger.info(f"Retrieved {len(data)} rows.")
        columns = params.get("FIELDS")
        if columns:
            return [dict(zip(columns, row)) for row in data]
        return data

    def get_table_schema(table_prefixes: list[str]):
        conditions = [f"TABNAME LIKE '{prefix}%'" for prefix in table_prefixes]
        # Each line in OPTIONS must be ≤72 characters.
        grouped = textwrap.wrap(" OR ".join(conditions), width=72)
        options = [{"TEXT": line} for line in grouped]
        # Filter out internal SAP objects.
        options += [{"TEXT": " AND TABCLASS = 'TRANSP'"}]
        params = {
            "QUERY_TABLE": "DD02L",
            "FIELDS": ["TABNAME"],
            "OPTIONS": options,
            "DELIMITER": delimiter,
        }
        yield get_response(params)

    @dlt.resource(
        name="standard_tables", write_disposition="merge", primary_key="TABNAME"
    )
    def standard_tables():
        """Get a list of standard SAP tables."""
        yield from get_table_schema(standard_table_prefixes)

    @dlt.resource(write_disposition="merge", primary_key="TABNAME")
    def custom_tables():
        """Get a list of custom SAP tables."""
        yield from get_table_schema(custom_table_prefixes)

    def get_table_details(table_names):
        """Get metadata about SAP tables."""
        table_batches = batched(table_names, 1000)
        for batch_number, batch in enumerate(table_batches, start=1):
            conditions = [f"TABNAME = '{table}'" for table in batch]
            # Each line in OPTIONS must be ≤72 characters.
            grouped = textwrap.wrap(" OR ".join(conditions), width=72)
            options = [{"TEXT": line} for line in grouped]
            params = {
                "QUERY_TABLE": "DD03L",
                "FIELDS": ["TABNAME", "FIELDNAME", "DATATYPE", "LENG"],
                "OPTIONS": options,
                "DELIMITER": delimiter,
            }
            logger.info(f"Extracting table batch number {batch_number}...")
            yield get_response(params)

    @dlt.transformer(
        data_from=standard_tables,
        write_disposition="merge",
        primary_key=("TABNAME", "FIELDNAME"),
    )
    def standard_tables_details(standard_tables):
        standard_table_names = [row["TABNAME"] for row in standard_tables]
        yield from get_table_details(standard_table_names)

    @dlt.transformer(
        data_from=custom_tables,
        write_disposition="merge",
        primary_key=("TABNAME", "FIELDNAME"),
    )
    def custom_tables_details(custom_tables):
        custom_table_names = [row["TABNAME"] for row in custom_tables]
        yield from get_table_details(custom_table_names)

    @dlt.resource(write_disposition="merge", primary_key="ROLLNAME")
    def column_details():
        """Get metadata about table columns in SAP tables."""
        params = {
            "QUERY_TABLE": "DD04T",
            "FIELDS": ["ROLLNAME", "DDTEXT"],
            "OPTIONS": [{"TEXT": "DDLANGUAGE = 'EN'"}],
            "DELIMITER": delimiter,
        }
        logger.info("Extracting table columns...")
        yield get_response(params)

    @dlt.resource(write_disposition="replace")
    def abap_programs():
        """List all ABAP programs (reports, includes, module pools, etc.)."""
        # Iterate 5 letters at a time as SAP can't handle large queries,
        # and it seems there can be millions of ABAP programs.
        logger.info("Extracting ABAP programs...")
        for prefixes_group in batched(string.ascii_uppercase, 5):
            logger.info(
                f"Extracting programs with the following prefixes: {prefixes_group}..."
            )
            conditions = [f"NAME LIKE '{prefix}%'" for prefix in prefixes_group]

            # Each line in OPTIONS must be ≤72 characters.
            grouped = textwrap.wrap(" OR ".join(conditions), width=72)
            options = [{"TEXT": line} for line in grouped]
            # '1' = Executable report
            # 'I' = Include
            # 'M' = Module pool (dialog program)
            # 'F' = Function group main program (needed for function modules)
            # 'K' = Class pool (needed for global classes)
            # 'J' = Interface pool
            # 'S' = Subroutine pool
            options += [{"TEXT": " AND SUBC IN ('1','I','M','F','K','J','S')"}]
            # NOTE: all these filters only remove ~5% of records.
            params = {
                "QUERY_TABLE": "TRDIR",
                "FIELDS": ["NAME"],
                "OPTIONS": options,
                "DELIMITER": delimiter,
            }
            yield get_response(params)

    @dlt.transformer(
        data_from=abap_programs, write_disposition="merge", primary_key="name"
    )
    def abap_programs_source_code(abap_programs):
        logger.info("Extracting the source code of ABAP programs...")
        program_names = (row["NAME"] for row in abap_programs)
        for program in program_names:
            params = {"PROGRAM_NAME": program}
            logger.info(f"Extracting source for ABAP program: {program}...")
            source_code_lines = get_response(
                params,
                func="RPY_PROGRAM_READ",
                data_key="SOURCE",
                fallback_data_key="SOURCE_EXTENDED",
                fallback_record_key="LINE",
            )
            yield {
                "name": program,
                "source": "\n".join([line[0] for line in source_code_lines]),
            }

    @dlt.resource(write_disposition="replace")
    def abap_transactions():
        pass

    return [
        standard_tables,
        standard_tables_details,
        custom_tables,
        custom_tables_details,
        column_details,
        abap_programs,
        abap_programs_source_code,
    ]


if __name__ == "__main__":
    pipeline = dlt.pipeline(
        pipeline_name="sap", destination="duckdb", dataset_name="bronze"
    )
    # pipeline.run(sap().with_resources("column_details", "custom_tables_details"))
    # pipeline.run(sap().with_resources("column_details", "standard_tables_details"))
    pipeline.run(sap().with_resources("abap_programs_source_code"))
    # pipeline.run(sap().add_limit(1).with_resources("abap_programs"))
    # pipeline.run(sap().with_resources("abap_programs"))
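
Note: the recurring `textwrap.wrap(..., width=72)` trick exists because
RFC_READ_TABLE's OPTIONS parameter is a table of 72-character lines that SAP
concatenates back into a single WHERE clause. A standalone sketch of the
mechanism, mirroring `get_table_schema` above:

    import textwrap

    conditions = " OR ".join(f"TABNAME LIKE '{p}%'" for p in ["Z", "Y"])
    options = [{"TEXT": line} for line in textwrap.wrap(conditions, width=72)]
    # -> [{"TEXT": "TABNAME LIKE 'Z%' OR TABNAME LIKE 'Y%'"}]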

perspective/ingest/sources/database/sap/pipeline.py
@@ -0,0 +1,23 @@
from collections.abc import Generator
from pathlib import Path

from loguru import logger

from perspective.ingest.sources.database.sap.extract import sap
from perspective.ingest.sources.database.sap.transform import transform
from perspective.models.databases import DatabaseTableManifest


def pipeline() -> Generator[DatabaseTableManifest, None, None]:
    """Pipeline to extract SAP metadata and transform it into table manifests."""
    source = sap().with_resources("column_details", "custom_tables_details")
    manifest_batches = transform(source)
    yield from manifest_batches


if __name__ == "__main__":
    manifest_batches = pipeline()
    for i, manifest in enumerate(manifest_batches):
        logger.info(f"Writing {len(manifest.payload)} tables to batch {i} manifest...")

        with Path(f"database_table_manifest__batch_{i}.json").open(
            "w", encoding="utf-8"
        ) as f:
            f.write(manifest.json())
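
Note: a written batch can be loaded back for re-validation. A hedged sketch,
assuming `DatabaseTableManifest` is a pydantic v1 model (consistent with the
`.json()` call above):

    manifest = DatabaseTableManifest.parse_file("database_table_manifest__batch_0.json")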