acryl-datahub 1.2.0.10rc7__py3-none-any.whl → 1.2.0.11rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/METADATA +2690 -2690
- {acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/RECORD +11 -11
- datahub/_version.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/sql_parsing/split_statements.py +13 -0
- datahub/utilities/file_backed_collections.py +7 -8
- {acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/RECORD
RENAMED
@@ -1,7 +1,7 @@
-acryl_datahub-1.2.0.
+acryl_datahub-1.2.0.11rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=tz3wAmCES3ENTNYPzXFzMy51fWbS4eqGgC2EztCaRw4,324
 datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
 datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -344,7 +344,7 @@ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=mUWcMt-_FL1SYGIgI4lGZD
 datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=y-9ZIs_DZPUzYH1CI6HmaAZg3olNNA7MjT8HrCqAI0k,11159
 datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=4-qQR_2HGIYU8kC2hRIsJyKKMb9lKq4B6paJm_abUk4,12628
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/gcs/gcs_source.py,sha256=
+datahub/ingestion/source/gcs/gcs_source.py,sha256=6Kff2FGpR-b_kI5dyMWPgOY2lK9kWVsQv6SdxSp4lYE,8207
 datahub/ingestion/source/gcs/gcs_utils.py,sha256=Kd2usZYIMFeSuE6_tJ4OoHGOdvG8mWaScFuAcIkC6P0,1789
 datahub/ingestion/source/git/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvkYGr9judHJFsOk,4143
@@ -461,7 +461,7 @@ datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pL
 datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
 datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
 datahub/ingestion/source/s3/report.py,sha256=9Ej1UCChw963UpGw1-7asi5vFrOM232gfgG8bRdKPp0,667
-datahub/ingestion/source/s3/source.py,sha256=
+datahub/ingestion/source/s3/source.py,sha256=dADORK79xvoYvtnyO6THdRJFw97GovvimVd56GnMtKo,60481
 datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sac/sac.py,sha256=0s_JxHGOhit3Wvgbg7qQi-Z9j9_TgBX_I1yOR3L6-rA,30243
 datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -1009,7 +1009,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
 datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
 datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
 datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
-datahub/sql_parsing/split_statements.py,sha256=
+datahub/sql_parsing/split_statements.py,sha256=doCACwQ_Fx6m1djo7t3BnU9ZHki4EV2KJUQkFMGv7lg,10101
 datahub/sql_parsing/sql_parsing_aggregator.py,sha256=kxxSVe3YNoz_T2OG6-F30ZuXNSXuBZ-E54RqObo6qTI,72323
 datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
@@ -1044,7 +1044,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
 datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
 datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
 datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
-datahub/utilities/file_backed_collections.py,sha256=
+datahub/utilities/file_backed_collections.py,sha256=eOW7_8CzopvzFk1IATVuGGzQvS4yLQzHR_HU3h6T4kY,21675
 datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
 datahub/utilities/groupby.py,sha256=pe6rP4ZCttYB98yjbs0Aey8C32aLb7rq-NJ_BFky0H4,524
 datahub/utilities/hive_schema_to_avro.py,sha256=YCdq3jNUTij8ehWgX9v6CiOrf5aTCXr4DERcp_-wBbo,11608
@@ -1121,8 +1121,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.2.0.
-acryl_datahub-1.2.0.
-acryl_datahub-1.2.0.
-acryl_datahub-1.2.0.
-acryl_datahub-1.2.0.
+acryl_datahub-1.2.0.11rc1.dist-info/METADATA,sha256=3JRUiyiwDcm9IPEXFumax-ubhLEWe6dka9Jhf3VcDPI,184162
+acryl_datahub-1.2.0.11rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+acryl_datahub-1.2.0.11rc1.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
+acryl_datahub-1.2.0.11rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.2.0.11rc1.dist-info/RECORD,,
datahub/_version.py
CHANGED
datahub/ingestion/source/gcs/gcs_source.py
CHANGED
@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+
 
 class HMACKey(ConfigModel):
     hmac_access_id: str = Field(description="Access ID")
@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
         s3_config = DataLakeSourceConfig(
             path_specs=s3_path_specs,
             aws_config=AwsConnectionConfig(
-                aws_endpoint_url=
+                aws_endpoint_url=GCS_ENDPOINT_URL,
                 aws_access_key_id=self.config.credential.hmac_access_id,
                 aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                 aws_region="auto",
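
The endpoint change above leans on GCS's S3-compatible XML API: a pair of HMAC keys lets an S3 client talk to storage.googleapis.com. A minimal sketch of the same interop with plain boto3; the bucket name and credential values are placeholders, not taken from this release:

    import boto3

    # Hypothetical HMAC credentials issued for a GCS service account.
    client = boto3.client(
        "s3",
        endpoint_url="https://storage.googleapis.com",  # same value as GCS_ENDPOINT_URL
        aws_access_key_id="GOOG1E...",
        aws_secret_access_key="<hmac-secret>",
        region_name="auto",
    )
    # List objects in a GCS bucket through the S3-compatible endpoint.
    for obj in client.list_objects_v2(Bucket="my-bucket").get("Contents", []):
        print(obj["Key"])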
@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
             max_rows=self.config.max_rows,
             number_of_files_to_sample=self.config.number_of_files_to_sample,
             platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+            platform_instance=self.config.platform_instance,
         )
         return s3_config
 
     def create_equivalent_s3_path_specs(self):
         s3_path_specs = []
         for path_spec in self.config.path_specs:
+            # PathSpec modifies the passed-in include to add /** to the end if
+            # autodetecting partitions. Remove that, otherwise creating a new
+            # PathSpec will complain.
+            # TODO: this should be handled inside PathSpec, which probably shouldn't
+            # modify its input.
+            include = path_spec.include
+            if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                include = include.removesuffix("**")
+
             s3_path_specs.append(
                 PathSpec(
-                    include=
+                    include=include.replace("gs://", "s3://"),
                     exclude=(
                         [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                         if path_spec.exclude
@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                     table_name=path_spec.table_name,
                     enable_compression=path_spec.enable_compression,
                     sample_files=path_spec.sample_files,
+                    allow_double_stars=path_spec.allow_double_stars,
+                    autodetect_partitions=path_spec.autodetect_partitions,
+                    include_hidden_folders=path_spec.include_hidden_folders,
+                    tables_filter_pattern=path_spec.tables_filter_pattern,
+                    traversal_method=path_spec.traversal_method,
                 )
             )
 
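
A rough standalone illustration of the path-spec rewrite performed above: drop the "**" that PathSpec auto-appends when partitions are autodetected, then swap the gs:// scheme for s3:// so an equivalent S3 path spec can be built. The sample include pattern is invented for illustration and this helper is not the library function:

    def to_s3_include(include: str, allow_double_stars: bool = False) -> str:
        # Mirror of the conversion sketched in the diff above.
        if include.endswith("{table}/**") and not allow_double_stars:
            include = include.removesuffix("**")
        return include.replace("gs://", "s3://")

    print(to_s3_include("gs://my-bucket/data/{table}/**"))
    # -> s3://my-bucket/data/{table}/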
datahub/ingestion/source/s3/source.py
CHANGED
@@ -115,14 +115,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]
 
-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")
 
 
 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
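
URI_SCHEME_REGEX only needs to recognize a lowercase scheme followed by "://"; a quick sketch of how it behaves on a few made-up URIs:

    import re

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    for uri in ["s3://bucket/key", "gs://bucket/key", "relative/path"]:
        m = URI_SCHEME_REGEX.match(uri)
        stripped = re.sub(URI_SCHEME_REGEX, "", uri)
        print(uri, "->", m[0] if m else None, "->", stripped)
    # s3://bucket/key -> s3:// -> bucket/key
    # gs://bucket/key -> gs:// -> bucket/key
    # relative/path -> None -> relative/path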
@@ -448,9 +441,8 @@ class S3Source(StatefulIngestionSourceBase):
                 self.source_config.verify_ssl
             )
 
-
-
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
@@ -668,11 +660,9 @@ class S3Source(StatefulIngestionSourceBase):
         aspects: List[Optional[_Aspect]] = []
 
         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-
-
-
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")
 
         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -806,10 +796,20 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return relative_path
 
-    def
-
-
-
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path
 
     def extract_table_data(
         self,
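
The new helper normalizes a URI to the scheme the path spec expects, extracts the table name and base path, then restores the original scheme on the base path. A self-contained sketch of that round trip; extract_parts stands in for PathSpec.extract_table_name_and_path and is not the real implementation:

    import re
    from typing import Tuple

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    def extract_parts(normalized: str) -> Tuple[str, str]:
        # Toy stand-in: treat the last folder as the table name.
        base = normalized.rpartition("/")[0]
        return base.rsplit("/", 1)[-1], base

    def table_name_and_path(path: str) -> Tuple[str, str]:
        normalized = re.sub(URI_SCHEME_REGEX, "s3://", path)  # match against s3:// specs
        table_name, table_path = extract_parts(normalized)
        scheme = re.match(URI_SCHEME_REGEX, path)
        if scheme:  # put the original scheme (e.g. gs://) back on the base path
            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
        return table_name, table_path

    print(table_name_and_path("gs://bucket/data/orders/part-0.parquet"))
    # -> ('orders', 'gs://bucket/data/orders')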
@@ -819,7 +819,7 @@ class S3Source(StatefulIngestionSourceBase):
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path =
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -992,7 +992,9 @@ class S3Source(StatefulIngestionSourceBase):
             )
 
             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path(
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )
 
             yield Folder(
                 partition_id=partition_id,
@@ -1143,8 +1145,8 @@ class S3Source(StatefulIngestionSourceBase):
 
             # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
             # This uses the compiled regex pattern to extract the table name from the full path
-            table_name,
-            table_s3_path
+            table_name, _ = self.extract_table_name_and_path(
+                path_spec, table_s3_path
             )
 
             # Apply table name filtering if configured
datahub/sql_parsing/split_statements.py
CHANGED
@@ -52,6 +52,7 @@ class ParserState(Enum):
     STRING = 2
     COMMENT = 3
     MULTILINE_COMMENT = 4
+    BRACKETED_IDENTIFIER = 5
 
 
 class _StatementSplitter:
@@ -141,6 +142,10 @@
                 self.state = ParserState.STRING
                 self.current_statement.append(c)
                 prev_real_char = c
+            elif c == "[":
+                self.state = ParserState.BRACKETED_IDENTIFIER
+                self.current_statement.append(c)
+                prev_real_char = c
             elif c == "-" and next_char == "-":
                 self.state = ParserState.COMMENT
                 self.current_statement.append(c)
@@ -172,6 +177,14 @@
             elif c == "'":
                 self.state = ParserState.NORMAL
 
+        elif self.state == ParserState.BRACKETED_IDENTIFIER:
+            self.current_statement.append(c)
+            if c == "]" and next_char == "]":
+                self.current_statement.append(next_char)
+                self.i += 1
+            elif c == "]":
+                self.state = ParserState.NORMAL
+
         elif self.state == ParserState.COMMENT:
             self.current_statement.append(c)
             if c == "\n":
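
The BRACKETED_IDENTIFIER state exists because T-SQL quoted identifiers such as [My;Table] may contain semicolons, and a literal "]" inside them is escaped by doubling it, so a naive split on ";" would cut statements apart mid-identifier. A tiny standalone splitter showing the same idea; it is not the datahub implementation:

    from typing import List

    def split_sql_statements(sql: str) -> List[str]:
        statements: List[str] = []
        current: List[str] = []
        in_bracket = False
        i = 0
        while i < len(sql):
            c = sql[i]
            if in_bracket:
                current.append(c)
                if c == "]" and i + 1 < len(sql) and sql[i + 1] == "]":
                    current.append("]")  # "]]" is an escaped "]" inside the identifier
                    i += 1
                elif c == "]":
                    in_bracket = False
            elif c == "[":
                in_bracket = True
                current.append(c)
            elif c == ";":
                statements.append("".join(current).strip())
                current = []
            else:
                current.append(c)
            i += 1
        if current:
            statements.append("".join(current).strip())
        return [s for s in statements if s]

    print(split_sql_statements("SELECT [a;b] FROM t; UPDATE t SET [x]]y] = 1"))
    # -> ['SELECT [a;b] FROM t', 'UPDATE t SET [x]]y] = 1']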
datahub/utilities/file_backed_collections.py
CHANGED
@@ -33,13 +33,12 @@ from datahub.utilities.sentinels import Unset, unset
 
 logger: logging.Logger = logging.getLogger(__name__)
 
-
-
-
-
-
-
-)
+
+def _get_sqlite_version_override() -> bool:
+    """Check if SQLite version requirement should be overridden at runtime."""
+    override_str = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
+    return bool(override_str and override_str.lower() != "false")
+
 
 _DEFAULT_FILE_NAME = "sqlite.db"
 _DEFAULT_TABLE_NAME = "data"
@@ -231,7 +230,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
             # This was added in 3.24.0 from 2018-06-04.
             # See https://www.sqlite.org/lang_conflict.html
-            if
+            if _get_sqlite_version_override():
                 self._use_sqlite_on_conflict = False
             else:
                 raise RuntimeError("SQLite version 3.24.0 or later is required")
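
Because the override is now read per call instead of once at import, OVERRIDE_SQLITE_VERSION_REQ can be set after the module is imported. A hedged usage sketch for a host whose SQLite predates 3.24.0; it assumes the default FileBackedDict constructor creates a temporary database file:

    import os
    from datahub.utilities.file_backed_collections import FileBackedDict

    # Opt out of the version requirement: per the diff above, the dict then
    # disables its INSERT ... ON CONFLICT path instead of raising RuntimeError.
    os.environ["OVERRIDE_SQLITE_VERSION_REQ"] = "true"

    d: FileBackedDict[int] = FileBackedDict()
    d["rows_scanned"] = 42
    print(d["rows_scanned"])
    d.close()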
{acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/WHEEL
RENAMED
File without changes
{acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/entry_points.txt
RENAMED
File without changes
{acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/licenses/LICENSE
RENAMED
File without changes
{acryl_datahub-1.2.0.10rc7.dist-info → acryl_datahub-1.2.0.11rc1.dist-info}/top_level.txt
RENAMED
File without changes