acryl-datahub 1.2.0.10rc7__py3-none-any.whl → 1.2.0.11rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

@@ -1,7 +1,7 @@
- acryl_datahub-1.2.0.10rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.2.0.11rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=cd8EZsDwwIMWrvKB56EZ1C9Dzre3O-4gvEXudlmDjfQ,324
+ datahub/_version.py,sha256=tz3wAmCES3ENTNYPzXFzMy51fWbS4eqGgC2EztCaRw4,324
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -344,7 +344,7 @@ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=mUWcMt-_FL1SYGIgI4lGZD
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=y-9ZIs_DZPUzYH1CI6HmaAZg3olNNA7MjT8HrCqAI0k,11159
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=4-qQR_2HGIYU8kC2hRIsJyKKMb9lKq4B6paJm_abUk4,12628
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gcs/gcs_source.py,sha256=akpLTTOH4FPUn91klyvwY7ARr3x1NYBEdQLy7NqdPFw,7268
+ datahub/ingestion/source/gcs/gcs_source.py,sha256=6Kff2FGpR-b_kI5dyMWPgOY2lK9kWVsQv6SdxSp4lYE,8207
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=Kd2usZYIMFeSuE6_tJ4OoHGOdvG8mWaScFuAcIkC6P0,1789
  datahub/ingestion/source/git/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvkYGr9judHJFsOk,4143
@@ -461,7 +461,7 @@ datahub/ingestion/source/s3/config.py,sha256=lElFXgEpKDT9SVoiXvtx98wV6Gp880qP4pL
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=FfrcgK-JEF94vw-l3q6pN6FENXb-wZzW2w1VUZVkwW8,3620
  datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
  datahub/ingestion/source/s3/report.py,sha256=9Ej1UCChw963UpGw1-7asi5vFrOM232gfgG8bRdKPp0,667
- datahub/ingestion/source/s3/source.py,sha256=ASuDOr8onfHfP2PexvupZNs-VYViZ56dpgIRyn_oVK0,60242
+ datahub/ingestion/source/s3/source.py,sha256=dADORK79xvoYvtnyO6THdRJFw97GovvimVd56GnMtKo,60481
  datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sac/sac.py,sha256=0s_JxHGOhit3Wvgbg7qQi-Z9j9_TgBX_I1yOR3L6-rA,30243
  datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -1009,7 +1009,7 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
  datahub/sql_parsing/fingerprint_utils.py,sha256=3hGiexaQXnE7eZLxo-t7hlTyVQz7wombAcQELnN-yDY,185
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
- datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
+ datahub/sql_parsing/split_statements.py,sha256=doCACwQ_Fx6m1djo7t3BnU9ZHki4EV2KJUQkFMGv7lg,10101
  datahub/sql_parsing/sql_parsing_aggregator.py,sha256=kxxSVe3YNoz_T2OG6-F30ZuXNSXuBZ-E54RqObo6qTI,72323
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
@@ -1044,7 +1044,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
- datahub/utilities/file_backed_collections.py,sha256=4_11YQUaIdjr7SUV4AHczArBi8OrrlzuX15ldR1GhKA,21673
+ datahub/utilities/file_backed_collections.py,sha256=eOW7_8CzopvzFk1IATVuGGzQvS4yLQzHR_HU3h6T4kY,21675
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
  datahub/utilities/groupby.py,sha256=pe6rP4ZCttYB98yjbs0Aey8C32aLb7rq-NJ_BFky0H4,524
  datahub/utilities/hive_schema_to_avro.py,sha256=YCdq3jNUTij8ehWgX9v6CiOrf5aTCXr4DERcp_-wBbo,11608
@@ -1121,8 +1121,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.2.0.10rc7.dist-info/METADATA,sha256=kx-48Qes8ImK7vKjHellg40w5JwltF8xYZmAnuIy_JU,184162
- acryl_datahub-1.2.0.10rc7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- acryl_datahub-1.2.0.10rc7.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
- acryl_datahub-1.2.0.10rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.2.0.10rc7.dist-info/RECORD,,
+ acryl_datahub-1.2.0.11rc1.dist-info/METADATA,sha256=3JRUiyiwDcm9IPEXFumax-ubhLEWe6dka9Jhf3VcDPI,184162
+ acryl_datahub-1.2.0.11rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ acryl_datahub-1.2.0.11rc1.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
+ acryl_datahub-1.2.0.11rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.2.0.11rc1.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.2.0.10rc7"
+ __version__ = "1.2.0.11rc1"


  def is_dev_mode() -> bool:
datahub/ingestion/source/gcs/gcs_source.py CHANGED
@@ -37,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (

  logger: logging.Logger = logging.getLogger(__name__)

+ GCS_ENDPOINT_URL = "https://storage.googleapis.com"
+

  class HMACKey(ConfigModel):
      hmac_access_id: str = Field(description="Access ID")
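
Context note (not part of the diff): the new GCS_ENDPOINT_URL constant works because GCS exposes an S3-compatible XML API, so an S3 client pointed at that endpoint with HMAC interoperability keys behaves like a regular S3 client. A minimal sketch, with placeholder bucket name and credentials:

    # Illustration only: GCS's S3-compatible XML API accessed with boto3 and
    # HMAC interoperability keys. Bucket name and credentials are placeholders.
    import boto3

    s3 = boto3.client(
        "s3",
        endpoint_url="https://storage.googleapis.com",
        aws_access_key_id="<hmac-access-id>",
        aws_secret_access_key="<hmac-secret>",
        region_name="auto",
    )
    print(s3.list_objects_v2(Bucket="my-bucket", MaxKeys=5))
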
@@ -112,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
          s3_config = DataLakeSourceConfig(
              path_specs=s3_path_specs,
              aws_config=AwsConnectionConfig(
-                 aws_endpoint_url="https://storage.googleapis.com",
+                 aws_endpoint_url=GCS_ENDPOINT_URL,
                  aws_access_key_id=self.config.credential.hmac_access_id,
                  aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
                  aws_region="auto",
@@ -121,15 +123,25 @@ class GCSSource(StatefulIngestionSourceBase):
              max_rows=self.config.max_rows,
              number_of_files_to_sample=self.config.number_of_files_to_sample,
              platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
+             platform_instance=self.config.platform_instance,
          )
          return s3_config

      def create_equivalent_s3_path_specs(self):
          s3_path_specs = []
          for path_spec in self.config.path_specs:
+             # PathSpec modifies the passed-in include to add /** to the end if
+             # autodetecting partitions. Remove that, otherwise creating a new
+             # PathSpec will complain.
+             # TODO: this should be handled inside PathSpec, which probably shouldn't
+             # modify its input.
+             include = path_spec.include
+             if include.endswith("{table}/**") and not path_spec.allow_double_stars:
+                 include = include.removesuffix("**")
+
              s3_path_specs.append(
                  PathSpec(
-                     include=path_spec.include.replace("gs://", "s3://"),
+                     include=include.replace("gs://", "s3://"),
                      exclude=(
                          [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
                          if path_spec.exclude
@@ -140,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
                      table_name=path_spec.table_name,
                      enable_compression=path_spec.enable_compression,
                      sample_files=path_spec.sample_files,
+                     allow_double_stars=path_spec.allow_double_stars,
+                     autodetect_partitions=path_spec.autodetect_partitions,
+                     include_hidden_folders=path_spec.include_hidden_folders,
+                     tables_filter_pattern=path_spec.tables_filter_pattern,
+                     traversal_method=path_spec.traversal_method,
                  )
              )

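The include handling added above can be exercised on its own; a small standalone sketch with a hypothetical path spec value (PathSpec itself is not needed):

    # Standalone sketch of the include handling above (hypothetical values).
    # When partition autodetection appended "/**" to the include, the trailing
    # "**" is dropped before the gs:// prefix is rewritten to s3://.
    include = "gs://my-bucket/data/{table}/**"
    allow_double_stars = False

    if include.endswith("{table}/**") and not allow_double_stars:
        include = include.removesuffix("**")

    print(include.replace("gs://", "s3://"))  # -> s3://my-bucket/data/{table}/
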
datahub/ingestion/source/s3/source.py CHANGED
@@ -115,14 +115,7 @@ profiling_flags_to_report = [
      "include_field_sample_values",
  ]

-
- # LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
- # path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
- # )
- #
- # LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
- # ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
- # )
+ URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")


  def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
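
To make the new constant concrete, here is how the scheme regex behaves on a few hypothetical paths; the smart_open and browse-path changes below rely on exactly these substitutions:

    # Hypothetical paths showing what URI_SCHEME_REGEX matches and rewrites.
    import re

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    print(re.sub(URI_SCHEME_REGEX, "s3://", "gs://bucket/folder/file.parquet"))
    # -> s3://bucket/folder/file.parquet
    print(re.sub(URI_SCHEME_REGEX, "", "gs://bucket/folder/").strip("/"))
    # -> bucket/folder
    print(URI_SCHEME_REGEX.match("/local/path") is None)
    # -> True: local paths have no scheme and are left untouched
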
@@ -448,9 +441,8 @@ class S3Source(StatefulIngestionSourceBase):
                  self.source_config.verify_ssl
              )

-             file = smart_open(
-                 table_data.full_path, "rb", transport_params={"client": s3_client}
-             )
+             path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+             file = smart_open(path, "rb", transport_params={"client": s3_client})
          else:
              # We still use smart_open here to take advantage of the compression
              # capabilities of smart_open.
@@ -668,11 +660,9 @@ class S3Source(StatefulIngestionSourceBase):
          aspects: List[Optional[_Aspect]] = []

          logger.info(f"Extracting table schema from file: {table_data.full_path}")
-         browse_path: str = (
-             self.strip_s3_prefix(table_data.table_path)
-             if self.is_s3_platform()
-             else table_data.table_path.strip("/")
-         )
+
+         # remove protocol and any leading or trailing slashes
+         browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")

          data_platform_urn = make_data_platform_urn(self.source_config.platform)
          logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -806,10 +796,20 @@ class S3Source(StatefulIngestionSourceBase):
          else:
              return relative_path

-     def extract_table_name(self, path_spec: PathSpec, named_vars: dict) -> str:
-         if path_spec.table_name is None:
-             raise ValueError("path_spec.table_name is not set")
-         return path_spec.table_name.format_map(named_vars)
+     def extract_table_name_and_path(
+         self, path_spec: PathSpec, path: str
+     ) -> Tuple[str, str]:
+         # Extract the table name and base path from a path that's been normalized back to the
+         # "s3://" scheme that matches the path_spec
+         table_name, table_path = path_spec.extract_table_name_and_path(
+             self._normalize_uri_for_pattern_matching(path)
+         )
+         # Then convert the table base path back to the original scheme
+         scheme = re.match(URI_SCHEME_REGEX, path)
+         if scheme:
+             table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+         return table_name, table_path

      def extract_table_data(
          self,
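
The scheme round trip in extract_table_name_and_path can be sketched without PathSpec; _match_table below is a hypothetical stand-in for PathSpec.extract_table_name_and_path and the paths are made up:

    # Sketch of the scheme round trip: match against the s3:// form, then
    # restore the original scheme on the returned table base path.
    import re
    from typing import Tuple

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    def _match_table(s3_path: str) -> Tuple[str, str]:
        # stand-in for PathSpec.extract_table_name_and_path, assuming a spec
        # shaped like s3://bucket/data/{table}/...
        parts = s3_path.split("/")
        return parts[4], "/".join(parts[:5])

    def extract(path: str) -> Tuple[str, str]:
        name, table_path = _match_table(re.sub(URI_SCHEME_REGEX, "s3://", path))
        scheme = re.match(URI_SCHEME_REGEX, path)
        if scheme:
            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
        return name, table_path

    print(extract("gs://bucket/data/orders/part-0.parquet"))
    # -> ('orders', 'gs://bucket/data/orders')
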
@@ -819,7 +819,7 @@ class S3Source(StatefulIngestionSourceBase):
          path = browse_path.file
          partitions = browse_path.partitions
          logger.debug(f"Getting table data for path: {path}")
-         table_name, table_path = path_spec.extract_table_name_and_path(path)
+         table_name, table_path = self.extract_table_name_and_path(path_spec, path)
          return TableData(
              display_name=table_name,
              is_s3=self.is_s3_platform(),
@@ -992,7 +992,9 @@ class S3Source(StatefulIngestionSourceBase):
              )

              # If partition_id is None, it means the folder is not a partition
-             partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+             partition_id = path_spec.get_partition_from_path(
+                 self._normalize_uri_for_pattern_matching(max_file_s3_path)
+             )

              yield Folder(
                  partition_id=partition_id,
@@ -1143,8 +1145,8 @@ class S3Source(StatefulIngestionSourceBase):

              # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
              # This uses the compiled regex pattern to extract the table name from the full path
-             table_name, table_path = path_spec.extract_table_name_and_path(
-                 table_s3_path
+             table_name, _ = self.extract_table_name_and_path(
+                 path_spec, table_s3_path
              )

              # Apply table name filtering if configured
datahub/sql_parsing/split_statements.py CHANGED
@@ -52,6 +52,7 @@ class ParserState(Enum):
      STRING = 2
      COMMENT = 3
      MULTILINE_COMMENT = 4
+     BRACKETED_IDENTIFIER = 5


  class _StatementSplitter:
@@ -141,6 +142,10 @@ class _StatementSplitter:
                  self.state = ParserState.STRING
                  self.current_statement.append(c)
                  prev_real_char = c
+             elif c == "[":
+                 self.state = ParserState.BRACKETED_IDENTIFIER
+                 self.current_statement.append(c)
+                 prev_real_char = c
              elif c == "-" and next_char == "-":
                  self.state = ParserState.COMMENT
                  self.current_statement.append(c)
@@ -172,6 +177,14 @@ class _StatementSplitter:
              elif c == "'":
                  self.state = ParserState.NORMAL

+             elif self.state == ParserState.BRACKETED_IDENTIFIER:
+                 self.current_statement.append(c)
+                 if c == "]" and next_char == "]":
+                     self.current_statement.append(next_char)
+                     self.i += 1
+                 elif c == "]":
+                     self.state = ParserState.NORMAL
+
              elif self.state == ParserState.COMMENT:
                  self.current_statement.append(c)
                  if c == "\n":
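
Usage sketch for the new state, assuming the module's public split_statements() helper (the class shown in this diff is private): a semicolon inside a T-SQL bracketed identifier no longer ends the statement, and a doubled ]] is consumed as an escaped closing bracket rather than the end of the identifier.

    # Assumed public helper; exact whitespace of the returned pieces may differ.
    from datahub.sql_parsing.split_statements import split_statements

    sql = 'SELECT 1 AS [weird;name]; SELECT 2'
    print(list(split_statements(sql)))
    # expected: the ";" inside [weird;name] does not split the statement,
    # so two statements are returned rather than three.
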
datahub/utilities/file_backed_collections.py CHANGED
@@ -33,13 +33,12 @@ from datahub.utilities.sentinels import Unset, unset
  logger: logging.Logger = logging.getLogger(__name__)


- OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = (
-     os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
- )
- OVERRIDE_SQLITE_VERSION_REQUIREMENT = (
-     OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR
-     and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false"
- )
+
+ def _get_sqlite_version_override() -> bool:
+     """Check if SQLite version requirement should be overridden at runtime."""
+     override_str = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
+     return bool(override_str and override_str.lower() != "false")
+

  _DEFAULT_FILE_NAME = "sqlite.db"
  _DEFAULT_TABLE_NAME = "data"
@@ -231,7 +230,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
              # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
              # This was added in 3.24.0 from 2018-06-04.
              # See https://www.sqlite.org/lang_conflict.html
-             if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
+             if _get_sqlite_version_override():
                  self._use_sqlite_on_conflict = False
              else:
                  raise RuntimeError("SQLite version 3.24.0 or later is required")
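
The practical effect of this refactor is that OVERRIDE_SQLITE_VERSION_REQ is now read each time the check runs rather than once at import, so setting the variable after datahub is imported still takes effect. A small demo of the helper's truth table, with the function body reproduced from the diff:

    # Helper body copied from the diff; only the surrounding demo is new.
    import os

    def _get_sqlite_version_override() -> bool:
        override_str = os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
        return bool(override_str and override_str.lower() != "false")

    os.environ.pop("OVERRIDE_SQLITE_VERSION_REQ", None)
    print(_get_sqlite_version_override())  # False: unset
    os.environ["OVERRIDE_SQLITE_VERSION_REQ"] = "false"
    print(_get_sqlite_version_override())  # False: explicitly disabled
    os.environ["OVERRIDE_SQLITE_VERSION_REQ"] = "1"
    print(_get_sqlite_version_override())  # True: override enabled
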