acryl-datahub 1.2.0.10rc6__py3-none-any.whl → 1.2.0.10rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
- acryl_datahub-1.2.0.10rc6.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.2.0.10rc8.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=mSBfn0BVaAuAA6tvKy3C6K9XsxhU7zQ1rYUggP0EN0E,324
4
+ datahub/_version.py,sha256=xDxMP7YG6O7bgAgZ4LVjgQh_jfbuV9rXJU1zFdpjQEA,324
5
5
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -215,7 +215,7 @@ datahub/ingestion/source/ge_data_profiler.py,sha256=9lEQdLcMBa7znqa6Zz-QWA4Uiv8K
215
215
  datahub/ingestion/source/ge_profiling_config.py,sha256=FIuZtce0gRncSRKA1V9GLg8H5JyJPieZweFJS36Q_CI,11523
216
216
  datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
217
217
  datahub/ingestion/source/ldap.py,sha256=PKoA5pVjuIxFfW1TcbYNIWSm7-C7shK2FDn7Zo5mrVM,18705
218
- datahub/ingestion/source/metabase.py,sha256=j8DRV2GvisezidL1JZ5HJLF_hdFdtvaoyDoEdEyh0Ks,32603
218
+ datahub/ingestion/source/metabase.py,sha256=txzrTtxD1hla3yspxY6GQRCZUFWOK03d0-wJqDmT9AQ,32695
219
219
  datahub/ingestion/source/mlflow.py,sha256=t7heUgivLXU7lxc-ndZxc1LZuoDKZgpSIe-x3ExXfMg,33340
220
220
  datahub/ingestion/source/mode.py,sha256=omehI5t10-TucVTgiREb3s-9suFFy9YsNidF9qtnc-M,72191
221
221
  datahub/ingestion/source/mongodb.py,sha256=ykUA2Jyn0rxzOO-pCWosOqvFTIRgFmqkCTRHAsxpOYc,21423
@@ -344,7 +344,7 @@ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=mUWcMt-_FL1SYGIgI4lGZD
344
344
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=y-9ZIs_DZPUzYH1CI6HmaAZg3olNNA7MjT8HrCqAI0k,11159
345
345
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=4-qQR_2HGIYU8kC2hRIsJyKKMb9lKq4B6paJm_abUk4,12628
346
346
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
347
- datahub/ingestion/source/gcs/gcs_source.py,sha256=akpLTTOH4FPUn91klyvwY7ARr3x1NYBEdQLy7NqdPFw,7268
347
+ datahub/ingestion/source/gcs/gcs_source.py,sha256=6Kff2FGpR-b_kI5dyMWPgOY2lK9kWVsQv6SdxSp4lYE,8207
348
348
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=Kd2usZYIMFeSuE6_tJ4OoHGOdvG8mWaScFuAcIkC6P0,1789
349
349
  datahub/ingestion/source/git/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
350
350
  datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvkYGr9judHJFsOk,4143
@@ -461,7 +461,7 @@ datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pL
461
461
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
462
462
  datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
463
463
  datahub/ingestion/source/s3/report.py,sha256=9Ej1UCChw963UpGw1-7asi5vFrOM232gfgG8bRdKPp0,667
464
- datahub/ingestion/source/s3/source.py,sha256=ASuDOr8onfHfP2PexvupZNs-VYViZ56dpgIRyn_oVK0,60242
464
+ datahub/ingestion/source/s3/source.py,sha256=dADORK79xvoYvtnyO6THdRJFw97GovvimVd56GnMtKo,60481
465
465
  datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
466
466
  datahub/ingestion/source/sac/sac.py,sha256=0s_JxHGOhit3Wvgbg7qQi-Z9j9_TgBX_I1yOR3L6-rA,30243
467
467
  datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -642,12 +642,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
642
642
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
643
643
  datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
644
644
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
645
- datahub/metadata/_internal_schema_classes.py,sha256=ORmPVDR7_UMk8x4pwbTukK7QZm0oFi9XK0DKa0oPjy0,1069945
646
- datahub/metadata/schema.avsc,sha256=bbtE3veiGe5fqJnMWJTERU99CMOR4CP2lAZidj4UWGE,709640
645
+ datahub/metadata/_internal_schema_classes.py,sha256=BBxTUOoQF1h-WXtAHGQM-Rh3Mhirx5nJBPserPZKFeY,1069945
646
+ datahub/metadata/schema.avsc,sha256=ykx9zsPrLioYBg84eMi4NGyev4POl6BkyAYFPXAjvMQ,771886
647
647
  datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
648
648
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
649
649
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
650
- datahub/metadata/_urns/urn_defs.py,sha256=tBlEg7f0jaIWVQfpgzTe2gjkthP4janfAwJO7yx6-cw,143257
650
+ datahub/metadata/_urns/urn_defs.py,sha256=_LgqKLHrmHHxpvrP-93NMJSLEnoFI8q72lkX17mK1XA,143257
651
651
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
652
652
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
653
653
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -973,7 +973,7 @@ datahub/sdk/dataflow.py,sha256=gdAPVVkyKvsKtsa1AwhN_LpzidG_XzV3nhtd1cjnzDA,11128
973
973
  datahub/sdk/datajob.py,sha256=5kU0txTDcn2ce3AhNry83TazPVhoYZ2rAPPNWM1_FP8,13677
974
974
  datahub/sdk/dataset.py,sha256=-C4TCJAs1PFkLAgkUZEU1JOg3orm7AAIkqjw7oo_4PQ,31400
975
975
  datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
976
- datahub/sdk/entity_client.py,sha256=NGVA2CwLqK16EgOPrPiIFodjPD6sM7eQ5E3w5Yl89cM,9428
976
+ datahub/sdk/entity_client.py,sha256=LtFu0lYOl5s_B2G7HXoYY6uXaTBld-MC8Z_UeVKYCbc,9770
977
977
  datahub/sdk/lineage_client.py,sha256=qSe2TEt4HKRVytAsDokkfzqErZiL46c0TMe6g2C5hAg,33766
978
978
  datahub/sdk/main_client.py,sha256=LAymeMOkrjjJjQQ8Nc7G3hvF3P8Y0k0AXrDEGDGt4iU,5706
979
979
  datahub/sdk/mlmodel.py,sha256=cO5R8BYVljmQ0w33RIOuZmj4nq8OJCDVAZGTQI6YFS8,12628
@@ -1009,7 +1009,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
1009
1009
  datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
1010
1010
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
1011
1011
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
1012
- datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
1012
+ datahub/sql_parsing/split_statements.py,sha256=doCACwQ_Fx6m1djo7t3BnU9ZHki4EV2KJUQkFMGv7lg,10101
1013
1013
  datahub/sql_parsing/sql_parsing_aggregator.py,sha256=kxxSVe3YNoz_T2OG6-F30ZuXNSXuBZ-E54RqObo6qTI,72323
1014
1014
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
1015
1015
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
@@ -1121,8 +1121,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1121
1121
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1122
1122
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1123
1123
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1124
- acryl_datahub-1.2.0.10rc6.dist-info/METADATA,sha256=Ruq1g6waFdCzmx2n0vm2h4XcrOfNZpDVjCxZWoLXPZ0,184162
1125
- acryl_datahub-1.2.0.10rc6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1126
- acryl_datahub-1.2.0.10rc6.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
1127
- acryl_datahub-1.2.0.10rc6.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1128
- acryl_datahub-1.2.0.10rc6.dist-info/RECORD,,
1124
+ acryl_datahub-1.2.0.10rc8.dist-info/METADATA,sha256=FnAZxap4iq7bDMvUTCEiquA1YbX_NclQz6-LeoenjG0,184162
1125
+ acryl_datahub-1.2.0.10rc8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1126
+ acryl_datahub-1.2.0.10rc8.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
1127
+ acryl_datahub-1.2.0.10rc8.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1128
+ acryl_datahub-1.2.0.10rc8.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.2.0.10rc6"
3
+ __version__ = "1.2.0.10rc8"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
37
37
 
38
38
  logger: logging.Logger = logging.getLogger(__name__)
39
39
 
40
+ GCS_ENDPOINT_URL = "https://storage.googleapis.com"
41
+
40
42
 
41
43
  class HMACKey(ConfigModel):
42
44
  hmac_access_id: str = Field(description="Access ID")
@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
112
114
  s3_config = DataLakeSourceConfig(
113
115
  path_specs=s3_path_specs,
114
116
  aws_config=AwsConnectionConfig(
115
- aws_endpoint_url="https://storage.googleapis.com",
117
+ aws_endpoint_url=GCS_ENDPOINT_URL,
116
118
  aws_access_key_id=self.config.credential.hmac_access_id,
117
119
  aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
118
120
  aws_region="auto",
@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
121
123
  max_rows=self.config.max_rows,
122
124
  number_of_files_to_sample=self.config.number_of_files_to_sample,
123
125
  platform=PLATFORM_GCS, # Ensure GCS platform is used for correct container subtypes
126
+ platform_instance=self.config.platform_instance,
124
127
  )
125
128
  return s3_config
126
129
 
127
130
  def create_equivalent_s3_path_specs(self):
128
131
  s3_path_specs = []
129
132
  for path_spec in self.config.path_specs:
133
+ # PathSpec modifies the passed-in include to add /** to the end if
134
+ # autodetecting partitions. Remove that, otherwise creating a new
135
+ # PathSpec will complain.
136
+ # TODO: this should be handled inside PathSpec, which probably shouldn't
137
+ # modify its input.
138
+ include = path_spec.include
139
+ if include.endswith("{table}/**") and not path_spec.allow_double_stars:
140
+ include = include.removesuffix("**")
141
+
130
142
  s3_path_specs.append(
131
143
  PathSpec(
132
- include=path_spec.include.replace("gs://", "s3://"),
144
+ include=include.replace("gs://", "s3://"),
133
145
  exclude=(
134
146
  [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
135
147
  if path_spec.exclude
@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
140
152
  table_name=path_spec.table_name,
141
153
  enable_compression=path_spec.enable_compression,
142
154
  sample_files=path_spec.sample_files,
155
+ allow_double_stars=path_spec.allow_double_stars,
156
+ autodetect_partitions=path_spec.autodetect_partitions,
157
+ include_hidden_folders=path_spec.include_hidden_folders,
158
+ tables_filter_pattern=path_spec.tables_filter_pattern,
159
+ traversal_method=path_spec.traversal_method,
143
160
  )
144
161
  )
145
162
 
@@ -13,7 +13,10 @@ from pydantic import Field, root_validator, validator
13
13
  from requests.models import HTTPError
14
14
 
15
15
  import datahub.emitter.mce_builder as builder
16
- from datahub.configuration.source_common import DatasetLineageProviderConfigBase
16
+ from datahub.configuration.source_common import (
17
+ DatasetLineageProviderConfigBase,
18
+ LowerCaseDatasetUrnConfigMixin,
19
+ )
17
20
  from datahub.ingestion.api.common import PipelineContext
18
21
  from datahub.ingestion.api.decorators import (
19
22
  SourceCapability,
@@ -61,7 +64,11 @@ logger = logging.getLogger(__name__)
61
64
  DATASOURCE_URN_RECURSION_LIMIT = 5
62
65
 
63
66
 
64
- class MetabaseConfig(DatasetLineageProviderConfigBase, StatefulIngestionConfigBase):
67
+ class MetabaseConfig(
68
+ DatasetLineageProviderConfigBase,
69
+ StatefulIngestionConfigBase,
70
+ LowerCaseDatasetUrnConfigMixin,
71
+ ):
65
72
  # See the Metabase /api/session endpoint for details
66
73
  # https://www.metabase.com/docs/latest/api-documentation.html#post-apisession
67
74
  connect_uri: str = Field(default="localhost:3000", description="Metabase host URL.")
@@ -115,14 +115,7 @@ profiling_flags_to_report = [
115
115
  "include_field_sample_values",
116
116
  ]
117
117
 
118
-
119
- # LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
120
- # path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
121
- # )
122
- #
123
- # LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
124
- # ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
125
- # )
118
+ URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")
126
119
 
127
120
 
128
121
  def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
@@ -448,9 +441,8 @@ class S3Source(StatefulIngestionSourceBase):
448
441
  self.source_config.verify_ssl
449
442
  )
450
443
 
451
- file = smart_open(
452
- table_data.full_path, "rb", transport_params={"client": s3_client}
453
- )
444
+ path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
445
+ file = smart_open(path, "rb", transport_params={"client": s3_client})
454
446
  else:
455
447
  # We still use smart_open here to take advantage of the compression
456
448
  # capabilities of smart_open.
@@ -668,11 +660,9 @@ class S3Source(StatefulIngestionSourceBase):
668
660
  aspects: List[Optional[_Aspect]] = []
669
661
 
670
662
  logger.info(f"Extracting table schema from file: {table_data.full_path}")
671
- browse_path: str = (
672
- self.strip_s3_prefix(table_data.table_path)
673
- if self.is_s3_platform()
674
- else table_data.table_path.strip("/")
675
- )
663
+
664
+ # remove protocol and any leading or trailing slashes
665
+ browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")
676
666
 
677
667
  data_platform_urn = make_data_platform_urn(self.source_config.platform)
678
668
  logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -806,10 +796,20 @@ class S3Source(StatefulIngestionSourceBase):
806
796
  else:
807
797
  return relative_path
808
798
 
809
- def extract_table_name(self, path_spec: PathSpec, named_vars: dict) -> str:
810
- if path_spec.table_name is None:
811
- raise ValueError("path_spec.table_name is not set")
812
- return path_spec.table_name.format_map(named_vars)
799
+ def extract_table_name_and_path(
800
+ self, path_spec: PathSpec, path: str
801
+ ) -> Tuple[str, str]:
802
+ # Extract the table name and base path from a path that's been normalized back to the
803
+ # "s3://" scheme that matches the path_spec
804
+ table_name, table_path = path_spec.extract_table_name_and_path(
805
+ self._normalize_uri_for_pattern_matching(path)
806
+ )
807
+ # Then convert the table base path back to the original scheme
808
+ scheme = re.match(URI_SCHEME_REGEX, path)
809
+ if scheme:
810
+ table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
811
+
812
+ return table_name, table_path
813
813
 
814
814
  def extract_table_data(
815
815
  self,
@@ -819,7 +819,7 @@ class S3Source(StatefulIngestionSourceBase):
819
819
  path = browse_path.file
820
820
  partitions = browse_path.partitions
821
821
  logger.debug(f"Getting table data for path: {path}")
822
- table_name, table_path = path_spec.extract_table_name_and_path(path)
822
+ table_name, table_path = self.extract_table_name_and_path(path_spec, path)
823
823
  return TableData(
824
824
  display_name=table_name,
825
825
  is_s3=self.is_s3_platform(),
@@ -992,7 +992,9 @@ class S3Source(StatefulIngestionSourceBase):
992
992
  )
993
993
 
994
994
  # If partition_id is None, it means the folder is not a partition
995
- partition_id = path_spec.get_partition_from_path(max_file_s3_path)
995
+ partition_id = path_spec.get_partition_from_path(
996
+ self._normalize_uri_for_pattern_matching(max_file_s3_path)
997
+ )
996
998
 
997
999
  yield Folder(
998
1000
  partition_id=partition_id,
@@ -1143,8 +1145,8 @@ class S3Source(StatefulIngestionSourceBase):
1143
1145
 
1144
1146
  # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
1145
1147
  # This uses the compiled regex pattern to extract the table name from the full path
1146
- table_name, table_path = path_spec.extract_table_name_and_path(
1147
- table_s3_path
1148
+ table_name, _ = self.extract_table_name_and_path(
1149
+ path_spec, table_s3_path
1148
1150
  )
1149
1151
 
1150
1152
  # Apply table name filtering if configured