acryl-datahub 1.2.0.10rc7__py3-none-any.whl → 1.2.0.11rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

@@ -1,7 +1,7 @@
- acryl_datahub-1.2.0.10rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.2.0.11rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=cd8EZsDwwIMWrvKB56EZ1C9Dzre3O-4gvEXudlmDjfQ,324
+ datahub/_version.py,sha256=tz3wAmCES3ENTNYPzXFzMy51fWbS4eqGgC2EztCaRw4,324
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -344,7 +344,7 @@ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=mUWcMt-_FL1SYGIgI4lGZD
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=y-9ZIs_DZPUzYH1CI6HmaAZg3olNNA7MjT8HrCqAI0k,11159
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=4-qQR_2HGIYU8kC2hRIsJyKKMb9lKq4B6paJm_abUk4,12628
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gcs/gcs_source.py,sha256=akpLTTOH4FPUn91klyvwY7ARr3x1NYBEdQLy7NqdPFw,7268
+ datahub/ingestion/source/gcs/gcs_source.py,sha256=6Kff2FGpR-b_kI5dyMWPgOY2lK9kWVsQv6SdxSp4lYE,8207
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=Kd2usZYIMFeSuE6_tJ4OoHGOdvG8mWaScFuAcIkC6P0,1789
  datahub/ingestion/source/git/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvkYGr9judHJFsOk,4143
@@ -461,7 +461,7 @@ datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pL
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
  datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
  datahub/ingestion/source/s3/report.py,sha256=9Ej1UCChw963UpGw1-7asi5vFrOM232gfgG8bRdKPp0,667
- datahub/ingestion/source/s3/source.py,sha256=ASuDOr8onfHfP2PexvupZNs-VYViZ56dpgIRyn_oVK0,60242
+ datahub/ingestion/source/s3/source.py,sha256=dADORK79xvoYvtnyO6THdRJFw97GovvimVd56GnMtKo,60481
  datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sac/sac.py,sha256=0s_JxHGOhit3Wvgbg7qQi-Z9j9_TgBX_I1yOR3L6-rA,30243
  datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -1009,7 +1009,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
  datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
- datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
+ datahub/sql_parsing/split_statements.py,sha256=doCACwQ_Fx6m1djo7t3BnU9ZHki4EV2KJUQkFMGv7lg,10101
  datahub/sql_parsing/sql_parsing_aggregator.py,sha256=kxxSVe3YNoz_T2OG6-F30ZuXNSXuBZ-E54RqObo6qTI,72323
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
@@ -1044,7 +1044,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
- datahub/utilities/file_backed_collections.py,sha256=4_11YQUaIdjr7SUV4AHczArBi8OrrlzuX15ldR1GhKA,21673
+ datahub/utilities/file_backed_collections.py,sha256=eOW7_8CzopvzFk1IATVuGGzQvS4yLQzHR_HU3h6T4kY,21675
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
  datahub/utilities/groupby.py,sha256=pe6rP4ZCttYB98yjbs0Aey8C32aLb7rq-NJ_BFky0H4,524
  datahub/utilities/hive_schema_to_avro.py,sha256=YCdq3jNUTij8ehWgX9v6CiOrf5aTCXr4DERcp_-wBbo,11608
@@ -1121,8 +1121,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.2.0.10rc7.dist-info/METADATA,sha256=kx-48Qes8ImK7vKjHellg40w5JwltF8xYZmAnuIy_JU,184162
- acryl_datahub-1.2.0.10rc7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- acryl_datahub-1.2.0.10rc7.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
- acryl_datahub-1.2.0.10rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.2.0.10rc7.dist-info/RECORD,,
+ acryl_datahub-1.2.0.11rc1.dist-info/METADATA,sha256=3JRUiyiwDcm9IPEXFumax-ubhLEWe6dka9Jhf3VcDPI,184162
+ acryl_datahub-1.2.0.11rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ acryl_datahub-1.2.0.11rc1.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
+ acryl_datahub-1.2.0.11rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.2.0.11rc1.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.2.0.10rc7"
+ __version__ = "1.2.0.11rc1"


  def is_dev_mode() -> bool:
datahub/ingestion/source/gcs/gcs_source.py CHANGED
@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (

  logger: logging.Logger = logging.getLogger(__name__)

+ GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+

  class HMACKey(ConfigModel):
      hmac_access_id: str = Field(description="Access ID")
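
Context note (not part of the diff): the new GCS_ENDPOINT_URL constant works because GCS exposes an S3-compatible XML API, so an S3 client pointed at that endpoint with HMAC interoperability keys behaves like a regular S3 client. A minimal sketch, with placeholder bucket name and credentials:

    # Illustration only: GCS's S3-compatible XML API accessed with boto3 and
    # HMAC interoperability keys. Bucket name and credentials are placeholders.
    import boto3

    s3 = boto3.client(
        "s3",
        endpoint_url="https://storage.googleapis.com",
        aws_access_key_id="<hmac-access-id>",
        aws_secret_access_key="<hmac-secret>",
        region_name="auto",
    )
    print(s3.list_objects_v2(Bucket="my-bucket", MaxKeys=5))
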
@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
          s3_config = DataLakeSourceConfig(
              path_specs=s3_path_specs,
              aws_config=AwsConnectionConfig(
-                 aws_endpoint_url="https://storage.googleapis.com",
+                 aws_endpoint_url=GCS_ENDPOINT_URL,
                  aws_access_key_id=self.config.credential.hmac_access_id,
                  aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                  aws_region="auto",
@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
              max_rows=self.config.max_rows,
              number_of_files_to_sample=self.config.number_of_files_to_sample,
              platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+             platform_instance=self.config.platform_instance,
          )
          return s3_config

      def create_equivalent_s3_path_specs(self):
          s3_path_specs = []
          for path_spec in self.config.path_specs:
+             # PathSpec modifies the passed-in include to add /** to the end if
+             # autodetecting partitions. Remove that, otherwise creating a new
+             # PathSpec will complain.
+             # TODO: this should be handled inside PathSpec, which probably shouldn't
+             # modify its input.
+             include = path_spec.include
+             if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                 include = include.removesuffix("**")
+
              s3_path_specs.append(
                  PathSpec(
-                     include=path_spec.include.replace("gs://", "s3://"),
+                     include=include.replace("gs://", "s3://"),
                      exclude=(
                          [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                          if path_spec.exclude
@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                      table_name=path_spec.table_name,
                      enable_compression=path_spec.enable_compression,
                      sample_files=path_spec.sample_files,
+                     allow_double_stars=path_spec.allow_double_stars,
+                     autodetect_partitions=path_spec.autodetect_partitions,
+                     include_hidden_folders=path_spec.include_hidden_folders,
+                     tables_filter_pattern=path_spec.tables_filter_pattern,
+                     traversal_method=path_spec.traversal_method,
                  )
              )

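The include handling added above can be exercised on its own; a small standalone sketch with a hypothetical path spec value (PathSpec itself is not needed):

    # Standalone sketch of the include handling above (hypothetical values).
    # When partition autodetection appended "/**" to the include, the trailing
    # "**" is dropped before the gs:// prefix is rewritten to s3://.
    include = "gs://my-bucket/data/{table}/**"
    allow_double_stars = False

    if include.endswith("{table}/**") and not allow_double_stars:
        include = include.removesuffix("**")

    print(include.replace("gs://", "s3://"))  # -> s3://my-bucket/data/{table}/
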
datahub/ingestion/source/s3/source.py CHANGED
@@ -115,14 +115,7 @@ profiling_flags_to_report = [
      "include_field_sample_values",
  ]

-
- # LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
- # path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
- # )
- #
- # LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
- # ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
- # )
+ URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")


  def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
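
To make the new constant concrete, here is how the scheme regex behaves on a few hypothetical paths; the smart_open and browse-path changes below rely on exactly these substitutions:

    # Hypothetical paths showing what URI_SCHEME_REGEX matches and rewrites.
    import re

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    print(re.sub(URI_SCHEME_REGEX, "s3://", "gs://bucket/folder/file.parquet"))
    # -> s3://bucket/folder/file.parquet
    print(re.sub(URI_SCHEME_REGEX, "", "gs://bucket/folder/").strip("/"))
    # -> bucket/folder
    print(URI_SCHEME_REGEX.match("/local/path") is None)
    # -> True: local paths have no scheme and are left untouched
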
@@ -448,9 +441,8 @@ class S3Source(StatefulIngestionSourceBase):
                  self.source_config.verify_ssl
              )

-             file = smart_open(
-                 table_data.full_path, "rb", transport_params={"client": s3_client}
-             )
+             path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+             file = smart_open(path, "rb", transport_params={"client": s3_client})
          else:
              # We still use smart_open here to take advantage of the compression
              # capabilities of smart_open.
@@ -668,11 +660,9 @@ class S3Source(StatefulIngestionSourceBase):
          aspects: List[Optional[_Aspect]] = []

          logger.info(f"Extracting table schema from file: {table_data.full_path}")
-         browse_path: str = (
-             self.strip_s3_prefix(table_data.table_path)
-             if self.is_s3_platform()
-             else table_data.table_path.strip("/")
-         )
+
+         # remove protocol and any leading or trailing slashes
+         browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")

          data_platform_urn = make_data_platform_urn(self.source_config.platform)
          logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -806,10 +796,20 @@ class S3Source(StatefulIngestionSourceBase):
          else:
              return relative_path

-     def extract_table_name(self, path_spec: PathSpec, named_vars: dict) -> str:
-         if path_spec.table_name is None:
-             raise ValueError("path_spec.table_name is not set")
-         return path_spec.table_name.format_map(named_vars)
+     def extract_table_name_and_path(
+         self, path_spec: PathSpec, path: str
+     ) -> Tuple[str, str]:
+         # Extract the table name and base path from a path that's been normalized back to the
+         # "s3://" scheme that matches the path_spec
+         table_name, table_path = path_spec.extract_table_name_and_path(
+             self._normalize_uri_for_pattern_matching(path)
+         )
+         # Then convert the table base path back to the original scheme
+         scheme = re.match(URI_SCHEME_REGEX, path)
+         if scheme:
+             table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+         return table_name, table_path

      def extract_table_data(
          self,
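
The scheme round trip in extract_table_name_and_path can be sketched without PathSpec; _match_table below is a hypothetical stand-in for PathSpec.extract_table_name_and_path and the paths are made up:

    # Sketch of the scheme round trip: match against the s3:// form, then
    # restore the original scheme on the returned table base path.
    import re
    from typing import Tuple

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    def _match_table(s3_path: str) -> Tuple[str, str]:
        # stand-in for PathSpec.extract_table_name_and_path, assuming a spec
        # shaped like s3://bucket/data/{table}/...
        parts = s3_path.split("/")
        return parts[4], "/".join(parts[:5])

    def extract(path: str) -> Tuple[str, str]:
        name, table_path = _match_table(re.sub(URI_SCHEME_REGEX, "s3://", path))
        scheme = re.match(URI_SCHEME_REGEX, path)
        if scheme:
            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
        return name, table_path

    print(extract("gs://bucket/data/orders/part-0.parquet"))
    # -> ('orders', 'gs://bucket/data/orders')
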
@@ -819,7 +819,7 @@ class S3Source(StatefulIngestionSourceBase):
          path = browse_path.file
          partitions = browse_path.partitions
          logger.debug(f"Getting table data for path: {path}")
-         table_name, table_path = path_spec.extract_table_name_and_path(path)
+         table_name, table_path = self.extract_table_name_and_path(path_spec, path)
          return TableData(
              display_name=table_name,
              is_s3=self.is_s3_platform(),
@@ -992,7 +992,9 @@ class S3Source(StatefulIngestionSourceBase):
              )

              # If partition_id is None, it means the folder is not a partition
-             partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+             partition_id = path_spec.get_partition_from_path(
+                 self._normalize_uri_for_pattern_matching(max_file_s3_path)
+             )

              yield Folder(
                  partition_id=partition_id,
@@ -1143,8 +1145,8 @@ class S3Source(StatefulIngestionSourceBase):

              # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
              # This uses the compiled regex pattern to extract the table name from the full path
-             table_name, table_path = path_spec.extract_table_name_and_path(
-                 table_s3_path
+             table_name, _ = self.extract_table_name_and_path(
+                 path_spec, table_s3_path
              )

              # Apply table name filtering if configured
datahub/sql_parsing/split_statements.py CHANGED
@@ -52,6 +52,7 @@ class ParserState(Enum):
      STRING = 2
      COMMENT = 3
      MULTILINE_COMMENT = 4
+     BRACKETED_IDENTIFIER = 5


  class _StatementSplitter:
@@ -141,6 +142,10 @@ class _StatementSplitter:
                  self.state = ParserState.STRING
                  self.current_statement.append(c)
                  prev_real_char = c
+             elif c == "[":
+                 self.state = ParserState.BRACKETED_IDENTIFIER
+                 self.current_statement.append(c)
+                 prev_real_char = c
              elif c == "-" and next_char == "-":
                  self.state = ParserState.COMMENT
                  self.current_statement.append(c)
@@ -172,6 +177,14 @@ class _StatementSplitter:
              elif c == "'":
                  self.state = ParserState.NORMAL

+             elif self.state == ParserState.BRACKETED_IDENTIFIER:
+                 self.current_statement.append(c)
+                 if c == "]" and next_char == "]":
+                     self.current_statement.append(next_char)
+                     self.i += 1
+                 elif c == "]":
+                     self.state = ParserState.NORMAL
+
              elif self.state == ParserState.COMMENT:
                  self.current_statement.append(c)
                  if c == "\n":
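
Usage sketch for the new state, assuming the module's public split_statements() helper (the class shown in this diff is private): a semicolon inside a T-SQL bracketed identifier no longer ends the statement, and a doubled ]] is consumed as an escaped closing bracket rather than the end of the identifier.

    # Assumed public helper; exact whitespace of the returned pieces may differ.
    from datahub.sql_parsing.split_statements import split_statements

    sql = 'SELECT 1 AS [weird;name]; SELECT 2'
    print(list(split_statements(sql)))
    # expected: the ";" inside [weird;name] does not split the statement,
    # so two statements are returned rather than three.
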
datahub/utilities/file_backed_collections.py CHANGED
@@ -33,13 +33,12 @@ from datahub.utilities.sentinels import Unset, unset
  logger: logging.Logger = logging.getLogger(__name__)


- OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = (
-     os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
- )
- OVERRIDE_SQLITE_VERSION_REQUIREMENT = (
-     OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR
-     and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false"
- )
+
+ def _get_sqlite_version_override() -> bool:
+     """Check if SQLite version requirement should be overridden at runtime."""
+     override_str = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
+     return bool(override_str and override_str.lower() != "false")
+

  _DEFAULT_FILE_NAME = "sqlite.db"
  _DEFAULT_TABLE_NAME = "data"
@@ -231,7 +230,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
              # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
              # This was added in 3.24.0 from 2018-06-04.
              # See https://www.sqlite.org/lang_conflict.html
-             if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
+             if _get_sqlite_version_override():
                  self._use_sqlite_on_conflict = False
              else:
                  raise RuntimeError("SQLite version 3.24.0 or later is required")
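
The practical effect of this refactor is that OVERRIDE_SQLITE_VERSION_REQ is now read each time the check runs rather than once at import, so setting the variable after datahub is imported still takes effect. A small demo of the helper's truth table, with the function body reproduced from the diff:

    # Helper body copied from the diff; only the surrounding demo is new.
    import os

    def _get_sqlite_version_override() -> bool:
        override_str = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
        return bool(override_str and override_str.lower() != "false")

    os.environ.pop("OVERRIDE_SQLITE_VERSION_REQ", None)
    print(_get_sqlite_version_override())  # False: unset
    os.environ["OVERRIDE_SQLITE_VERSION_REQ"] = "false"
    print(_get_sqlite_version_override())  # False: explicitly disabled
    os.environ["OVERRIDE_SQLITE_VERSION_REQ"] = "1"
    print(_get_sqlite_version_override())  # True: override enabled
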