acryl-datahub-cloud 0.3.8.1rc1__py3-none-any.whl → 0.3.8.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub-cloud might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "acryl-datahub-cloud",
3
- "version": "0.3.8.1rc1",
3
+ "version": "0.3.8.2rc1",
4
4
  "install_requires": [
5
5
  "avro-gen3==0.7.16",
6
6
  "acryl-datahub"
@@ -315,6 +315,11 @@ class DataHubFormReportingData(FormData):
315
315
  if p in form_prompts
316
316
  ]:
317
317
  for owner in assignees:
318
+ if form_id not in form_assigned_dates:
319
+ logger.warning(
320
+ f"Form {form_id} not found in form_assigned_dates"
321
+ )
322
+ continue
318
323
  yield FormReportingRow(
319
324
  form_urn=form_id,
320
325
  form_assigned_date=form_assigned_dates[form_id],
@@ -344,6 +349,11 @@ class DataHubFormReportingData(FormData):
344
349
  if p in form_prompts
345
350
  ]:
346
351
  for owner in assignees:
352
+ if form_id not in form_assigned_dates:
353
+ logger.warning(
354
+ f"Form {form_id} not found in form_assigned_dates"
355
+ )
356
+ continue
347
357
  yield FormReportingRow(
348
358
  form_urn=form_id,
349
359
  form_assigned_date=form_assigned_dates[form_id],
@@ -403,7 +413,9 @@ class DataHubFormReportingData(FormData):
403
413
  for p in search_row.completedFormsIncompletePromptIds
404
414
  for p in form_prompts
405
415
  ]:
406
- logger.warning("Unexpected incomplete prompt in completed form")
416
+ logger.warning(
417
+ f"Unexpected incomplete prompt {prompt_id} in completed form {form_id}"
418
+ )
407
419
  for owner in assignees:
408
420
  yield FormReportingRow(
409
421
  form_urn=form_id,
@@ -438,6 +450,11 @@ class DataHubFormReportingData(FormData):
438
450
  if p in form_prompts
439
451
  ]:
440
452
  for owner in assignees:
453
+ if form_id not in form_assigned_dates:
454
+ logger.warning(
455
+ f"Form {form_id} not found in form_assigned_dates"
456
+ )
457
+ continue
441
458
  yield FormReportingRow(
442
459
  form_urn=form_id,
443
460
  form_assigned_date=form_assigned_dates[form_id],
File without changes
@@ -0,0 +1,85 @@
1
+ from typing import Any, Dict, Optional, Union
2
+
3
+ from datahub.ingestion.graph.client import DataHubGraph
4
+
5
+
6
def _parse_response(response: Dict) -> Dict:
    """Parse the textual payload of a restoreIndices response.

    ``response["value"]`` is expected to look like
    ``{args=RestoreIndicesArgs(k=v, ...), result=RestoreIndicesResult(k=v, ...)}``.

    Returns:
        A dict with the keys ``"args"`` and ``"result"``; each maps field
        names to values converted by ``_convert_value``.

    Raises:
        KeyError: if ``response`` has no ``"value"`` entry.
        IndexError: if the payload has no ``", result="`` separator.
    """
    value_str = response["value"].strip("{}")

    # Split once and reuse both halves instead of splitting twice.
    sections = value_str.split(", result=")
    # Remove only the leading "args=" label; a later "args=" inside a value
    # (e.g. in a URN pattern) must survive.
    args_part = sections[0].replace("args=", "", 1)
    result_part = sections[1]

    return {"args": _parse_args(args_part), "result": _parse_result(result_part)}


def _strip_wrapper(text: str, type_name: str) -> str:
    """Remove a surrounding ``TypeName(`` ... ``)`` wrapper from *text*.

    Only the wrapper itself is removed: the prefix and, when the prefix was
    present, a single trailing ``)``. Parentheses inside the content (for
    example in ``urn:li:dataset:(...)``) are left untouched.
    """
    text = text.strip()
    prefix = f"{type_name}("
    if text.startswith(prefix):
        text = text[len(prefix) :]
        if text.endswith(")"):
            text = text[:-1]
    return text.strip()


def _parse_dict(content: str) -> Dict:
    """Parse ``"k1=v1, k2=v2"`` (optionally wrapped in one pair of parentheses).

    Items are separated by ``", "``; each item is split on its first ``=``.
    Items without ``=`` are ignored. Values go through ``_convert_value``.
    When a key repeats, the last occurrence wins.
    """
    content = content.strip()
    # Peel off one enclosing pair only, so a value that itself ends in ")"
    # (such as a dataset URN) keeps its closing parenthesis.
    if content.startswith("(") and content.endswith(")"):
        content = content[1:-1].strip()

    parsed: Dict[str, Any] = {}
    for item in content.split(", "):
        key, sep, raw_value = item.partition("=")
        if sep:
            parsed[key.strip()] = _convert_value(raw_value.strip())
    return parsed


def _parse_args(args_str: str) -> Dict:
    """Parse the ``RestoreIndicesArgs(...)`` section into a dict."""
    return _parse_dict(_strip_wrapper(args_str, "RestoreIndicesArgs"))


def _parse_result(result_str: str) -> Dict:
    """Parse the ``RestoreIndicesResult(...)`` section into a dict."""
    return _parse_dict(_strip_wrapper(result_str, "RestoreIndicesResult"))


def _convert_value(v: str) -> Any:
    """Convert one textual field value into a Python value.

    ``"null"`` -> ``None``, ``"true"``/``"false"`` -> ``bool``, ``"[]"`` -> a
    new empty list, all-digit strings -> ``int``. Everything else (including
    the empty string and URNs) is returned unchanged.
    """
    if v == "null":
        return None
    if v == "true":
        return True
    if v == "false":
        return False
    if v == "[]":
        return []
    if v.isdigit():
        # str.isdigit() also accepts characters such as superscripts that
        # int() rejects; fall back to the raw text instead of raising.
        try:
            return int(v)
        except ValueError:
            return v
    return v
55
+
56
+
57
def restore_indices(
    graph: DataHubGraph,
    start: int,
    batch_size: int,
    limit: int,
    urn: Optional[str] = None,
    urn_like: Optional[str] = None,
    aspect: Optional[str] = None,
) -> Dict:
    """Call the ``restoreIndices`` operation and return the parsed response.

    Args:
        graph: Client used to issue the request; its ``config.server`` is the
            base URL.
        start: Sent as ``start``.
        batch_size: Sent as ``batchSize``.
        limit: Sent as ``limit``.
        urn: Sent as ``urn`` when not ``None``.
        urn_like: Sent as ``urnLike`` when not ``None``.
        aspect: Sent as ``aspect`` when not ``None``.

    Returns:
        The response after ``_parse_response``.

    Raises:
        RuntimeError: if both ``urn`` and ``urn_like`` are ``None``.
    """
    no_target = urn is None and urn_like is None
    if no_target:
        raise RuntimeError("Either urn or urn_like must be present")

    payload_dict: Dict[str, Union[str, int]] = {
        "start": start,
        "batchSize": batch_size,
        "limit": limit,
    }
    # Optional filters are only included when supplied.
    optional_fields = (("urnLike", urn_like), ("urn", urn), ("aspect", aspect))
    for field_name, field_value in optional_fields:
        if field_value is not None:
            payload_dict[field_name] = field_value

    endpoint = f"{graph.config.server}/operations?action=restoreIndices"
    raw_response = graph._post_generic(url=endpoint, payload_dict=payload_dict)
    return _parse_response(raw_response)
@@ -0,0 +1,139 @@
1
+ import logging
2
+ import time
3
+ from functools import partial
4
+ from typing import Any, Dict, Iterable, List, Optional
5
+
6
+ from pydantic import Field, root_validator
7
+
8
+ from acryl_datahub_cloud.datahub_restore.do_restore import restore_indices
9
+ from datahub.configuration.common import ConfigModel
10
+ from datahub.ingestion.api.common import PipelineContext
11
+ from datahub.ingestion.api.decorators import (
12
+ SupportStatus,
13
+ config_class,
14
+ platform_name,
15
+ support_status,
16
+ )
17
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
18
+ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
19
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
20
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
21
+ StatefulIngestionConfigBase,
22
+ StatefulIngestionReport,
23
+ StatefulIngestionSourceBase,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class DataHubRestoreIndicesReport(StatefulIngestionReport):
    """Running totals collected while calling the restore-indices operation.

    The counters are summed from the ``result`` section of each response by
    the restore source; ``last_aspect`` / ``last_urn`` hold the values from
    the most recent response.
    """

    # Number of restore_indices calls issued so far.
    calls_made: int = 0

    # Timings, summed across calls from the like-named "...Ms" result fields.
    aspect_check_ms: int = 0
    create_record_ms: int = 0
    time_get_rows_ms: int = 0
    time_sql_query_ms: int = 0

    # Sum of "rowsMigrated" across calls.
    rows_migrated: int = 0

    send_message_ms: int = 0
    time_urn_ms: int = 0

    # Sums of "defaultAspectsCreated" and "ignored" across calls.
    default_aspects_created: int = 0
    ignored: int = 0

    # "lastAspect" / "lastUrn" from the latest response.
    last_aspect: str = ""
    last_urn: str = ""

    time_entity_registry_check_ms: int = 0
43
+
44
+
45
class DataHubRestoreIndicesConfig(ConfigModel, StatefulIngestionConfigBase):
    """Configuration for the restore-indices source.

    Exactly one of ``urn`` and ``urn_like`` must be set; the remaining fields
    are passed through to the restore indices endpoint.
    """

    urn: Optional[str] = Field(
        default=None,
        description="The urn of the entity to restore indices for. If not provided, urn_like must be provided.",
    )
    urn_like: Optional[str] = Field(
        default=None,
        description="The urn_like of the entity to restore indices for. If not provided, urn must be provided.",
    )
    start: int = Field(
        default=0,
        description="Same as restore indices endpoint.",
    )
    batch_size: int = Field(
        default=1,
        description="Same as restore indices endpoint.",
    )
    aspect: Optional[str] = Field(
        default=None,
        description="Same as restore indices endpoint.",
    )

    @root_validator(pre=True)
    def extract_assertion_info(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Reject configs that set neither or both of ``urn`` / ``urn_like``."""
        urn_given = values.get("urn") is not None
        urn_like_given = values.get("urn_like") is not None

        if urn_given and urn_like_given:
            raise ValueError("Only one of urn or urn_like must be provided.")
        if not (urn_given or urn_like_given):
            raise ValueError("Either urn or urn_like must be provided.")
        return values
74
+
75
+
76
@platform_name(id="datahub", platform_name="datahub")
@config_class(DataHubRestoreIndicesConfig)
@support_status(SupportStatus.INCUBATING)
class DataHubRestoreSource(StatefulIngestionSourceBase):
    """Source that repeatedly calls ``restore_indices`` and records the totals.

    It emits no work units; its only output is the
    ``DataHubRestoreIndicesReport`` it fills in while paging through batches.
    """

    def __init__(self, ctx: PipelineContext, config: DataHubRestoreIndicesConfig):
        super().__init__(config=config, ctx=ctx)
        self.report: DataHubRestoreIndicesReport = DataHubRestoreIndicesReport()
        self.report.event_not_produced_warn = False
        self.config = config
        self.graph = ctx.require_graph("The DataHubRestore Source")
        # 0.0 means "never printed", so the first _print_report() call logs.
        self.last_print_time = 0.0

    def get_report(self) -> SourceReport:
        """Return the report object this source updates."""
        return self.report

    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        """A list of functions that transforms the workunits produced by this source.
        Run in order, first in list is applied first. Be careful with order when overriding.
        """

        return [
            partial(auto_workunit_reporter, self.get_report()),
        ]

    def _print_report(self) -> None:
        """Log the report if more than 60 seconds passed since the last log."""
        time_taken = round(time.time() - self.last_print_time, 1)
        if time_taken > 60:
            self.last_print_time = time.time()
            logger.info(f"\n{self.report.as_string()}")

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        """Call ``restore_indices`` batch by batch, accumulating the report.

        Starts at ``config.start`` and requests ``config.batch_size`` rows per
        call. Paging continues while a call reports exactly ``batch_size``
        migrated rows. Yields nothing.
        """
        batch_size = self.config.batch_size
        start = self.config.start
        while True:
            self.report.calls_made += 1
            response = restore_indices(
                graph=self.graph,
                start=start,
                batch_size=batch_size,
                limit=batch_size,
                urn=self.config.urn,
                urn_like=self.config.urn_like,
                aspect=self.config.aspect,
            )
            result = response["result"]
            self.report.aspect_check_ms += result["aspectCheckMs"]
            self.report.create_record_ms += result["createRecordMs"]
            self.report.rows_migrated += result["rowsMigrated"]
            self.report.send_message_ms += result["sendMessageMs"]
            self.report.time_get_rows_ms += result["timeGetRowMs"]
            self.report.time_sql_query_ms += result["timeSqlQueryMs"]
            self.report.time_urn_ms += result["timeUrnMs"]
            self.report.default_aspects_created += result["defaultAspectsCreated"]
            self.report.ignored += result["ignored"]
            self.report.time_entity_registry_check_ms += result[
                "timeEntityRegistryCheckMs"
            ]
            self.report.last_aspect = result["lastAspect"]
            self.report.last_urn = result["lastUrn"]
            self._print_report()
            # A full batch means more rows may remain. The batch_size > 0 check
            # prevents an endless loop when batch_size is 0: "0 rows migrated"
            # would equal the batch size while `start` never advances.
            if batch_size > 0 and result["rowsMigrated"] == batch_size:
                start += batch_size
            else:
                break
        # Keeps this method a generator even though it produces no work units.
        yield from []