snowpark-checkpoints-validators 0.1.0rc1__py3-none-any.whl → 0.1.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_validators-0.1.0rc1.dist-info → snowpark_checkpoints_validators-0.1.0rc2.dist-info}/METADATA +120 -52
- snowpark_checkpoints_validators-0.1.0rc2.dist-info/RECORD +4 -0
- snowpark_checkpoints_validators-0.1.0rc1.dist-info/RECORD +0 -4
- {snowpark_checkpoints_validators-0.1.0rc1.dist-info → snowpark_checkpoints_validators-0.1.0rc2.dist-info}/WHEEL +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1.dist-info → snowpark_checkpoints_validators-0.1.0rc2.dist-info}/licenses/LICENSE +0 -0
{snowpark_checkpoints_validators-0.1.0rc1.dist-info → snowpark_checkpoints_validators-0.1.0rc2.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-validators
-Version: 0.1.0rc1
+Version: 0.1.0rc2
 Summary: Migration tools for Snowpark
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -244,7 +244,7 @@ Requires-Dist: setuptools>=70.0.0; extra == 'development'
 Requires-Dist: twine==5.1.1; extra == 'development'
 Description-Content-Type: text/markdown

-#
+# snowpark-checkpoints-validators

 ---
 **NOTE**
@@ -270,9 +270,16 @@ This package is on Private Preview.
 The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.

 ```python
-from snowflake.
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.utils.constant import (
+    CheckpointMode,
+)
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Any, Optional

-validate_dataframe_checkpoint(
+# Signature of the function
+def validate_dataframe_checkpoint(
     df: SnowparkDataFrame,
     checkpoint_name: str,
     job_context: Optional[SnowparkJobContext] = None,
@@ -283,16 +290,17 @@ validate_dataframe_checkpoint(
     sample_number: Optional[int] = None,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
     output_path: Optional[str] = None,
-)
+):
+    ...
 ```

-- `df`: Snowpark
-- `checkpoint_name`: Name of the checkpoint schema file or
+- `df`: Snowpark dataframe to validate.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
 - `job_context`: Snowpark job context.
 - `mode`: Checkpoint mode (schema or data).
 - `custom_checks`: Custom checks to perform.
 - `skip_checks`: Checks to skip.
-- `sample_frac`: Fraction of the
+- `sample_frac`: Fraction of the dataframe to sample.
 - `sample_number`: Number of rows to sample.
 - `sampling_strategy`: Sampling strategy to use.
 - `output_path`: Output path for the checkpoint report.
@@ -301,16 +309,24 @@ validate_dataframe_checkpoint(

 ```python
 from snowflake.snowpark import Session
-from snowflake.
+from snowflake.snowpark_checkpoints.utils.constant import (
+    CheckpointMode,
+)
 from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import SparkSession

 session = Session.builder.getOrCreate()
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
 df = session.read.format("csv").load("data.csv")

 validate_dataframe_checkpoint(
     df,
     "schema_checkpoint",
-    job_context=
+    job_context=job_context,
     mode=CheckpointMode.SCHEMA,
     sample_frac=0.1,
     sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
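The rc2 example above exercises only `CheckpointMode.SCHEMA`. A minimal sketch of the data-mode variant, reusing `df` and `job_context` from that example, and assuming `CheckpointMode.DATAFRAME` is the enum member behind the "data" option named in the parameter list:

```python
# Hedged sketch, not from the package README: validates sampled data,
# not just the schema. CheckpointMode.DATAFRAME is an assumption about
# the enum member implementing the "data" mode; the report directory
# is a hypothetical path. Reuses df and job_context from above.
validate_dataframe_checkpoint(
    df,
    "data_checkpoint",
    job_context=job_context,
    mode=CheckpointMode.DATAFRAME,
    sample_number=1000,  # cap sampled rows instead of sampling a fraction
    sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
    output_path="/tmp/checkpoint_reports",
)
```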
@@ -319,22 +335,24 @@ validate_dataframe_checkpoint(

 ### Check with Spark Decorator

-The `check_with_spark` decorator converts any Snowpark
+The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframes. It then executes a provided Spark function and compares the outputs between the two implementations.

 ```python
-from snowflake.snowpark_checkpoints.
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Callable, Optional, TypeVar

-check_with_spark(
+fn = TypeVar("F", bound=Callable)
+
+# Signature of the decorator
+def check_with_spark(
     job_context: Optional[SnowparkJobContext],
-    spark_function:
+    spark_function: fn,
     checkpoint_name: str,
     sample_number: Optional[int] = 100,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-    check_dtypes: Optional[bool] = False,
-    check_with_precision: Optional[bool] = False,
     output_path: Optional[str] = None,
-)
-def snowpark_fn(df: SnowparkDataFrame):
+) -> Callable[[fn], fn]:
     ...
 ```

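The rc2 signature makes explicit that `check_with_spark(...)` is a decorator factory returning `Callable[[fn], fn]`. A minimal sketch of what that shape implies for callers; `my_spark_fn`, `my_snowpark_fn`, `job_context`, and `df` are hypothetical placeholders:

```python
# Hedged sketch: manual application of the decorator factory. Only the
# factory-returns-a-decorator shape comes from the signature above; all
# names here are hypothetical placeholders.
decorator = check_with_spark(
    job_context=job_context,
    spark_function=my_spark_fn,
    checkpoint_name="manual_decoration_checkpoint",
)
checked_fn = decorator(my_snowpark_fn)  # same object the @-syntax would produce
result = checked_fn(df)                 # runs both implementations on samples and compares
```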
@@ -343,8 +361,6 @@ def snowpark_fn(df: SnowparkDataFrame):
 - `checkpoint_name`: Name of the check.
 - `sample_number`: Number of rows to sample.
 - `sampling_strategy`: Sampling strategy to use.
-- `check_dtypes`: Check data types.
-- `check_with_precision`: Check with precision.
 - `output_path`: Output path for the checkpoint report.

 ### Usage Example
@@ -353,52 +369,63 @@ def snowpark_fn(df: SnowparkDataFrame):
 from snowflake.snowpark import Session
 from snowflake.snowpark import DataFrame as SnowparkDataFrame
 from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import DataFrame as SparkDataFrame, SparkSession

 session = Session.builder.getOrCreate()
-
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
+
+def my_spark_scalar_fn(df: SparkDataFrame):
+    return df.count()

 @check_with_spark(
-    job_context=
-    spark_function=
-    checkpoint_name="
-    sample_number=100,
-    sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
+    job_context=job_context,
+    spark_function=my_spark_scalar_fn,
+    checkpoint_name="count_checkpoint",
 )
-def
-return df.
+def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+    return df.count()

-
+df = job_context.snowpark_session.create_dataframe(
+    [[1, 2], [3, 4]], schema=["a", "b"]
+)
+count = my_snowpark_scalar_fn(df)
 ```

 ### Pandera Snowpark Decorators

-The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark
+The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.

 ```python
-from snowflake.snowpark_checkpoints.
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pandera import DataFrameSchema
+from typing import Optional

-check_input_schema(
+# Signature of the decorator
+def check_input_schema(
     pandera_schema: DataFrameSchema,
     checkpoint_name: str,
     sample_frac: Optional[float] = 1.0,
     sample_number: Optional[int] = None,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-    job_context: Optional[SnowparkJobContext],
+    job_context: Optional[SnowparkJobContext] = None,
     output_path: Optional[str] = None,
-)
-def snowpark_fn(df: SnowparkDataFrame):
+):
     ...

-check_output_schema(
+# Signature of the decorator
+def check_output_schema(
     pandera_schema: DataFrameSchema,
     checkpoint_name: str,
     sample_frac: Optional[float] = 1.0,
     sample_number: Optional[int] = None,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-    job_context: Optional[SnowparkJobContext],
+    job_context: Optional[SnowparkJobContext] = None,
     output_path: Optional[str] = None,
-)
-def snowpark_fn(df: SnowparkDataFrame):
+):
     ...
 ```

@@ -412,28 +439,71 @@ def snowpark_fn(df: SnowparkDataFrame):

 ### Usage Example

-
+#### Check Input Schema Example
+```python
+from pandas import DataFrame as PandasDataFrame
+from pandera import DataFrameSchema, Column, Check
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+from numpy import int8
+
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)

+in_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+        "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+    }
+)
+
+@check_input_schema(in_schema, "input_schema_checkpoint")
+def preprocessor(dataframe: SnowparkDataFrame):
+    dataframe = dataframe.withColumn(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
+    return dataframe
+
+session = Session.builder.getOrCreate()
+sp_dataframe = session.create_dataframe(df)
+
+preprocessed_dataframe = preprocessor(sp_dataframe)
+```
+
+#### Check Output Schema Example
 ```python
 from pandas import DataFrame as PandasDataFrame
 from pandera import DataFrameSchema, Column, Check
 from snowflake.snowpark import Session
 from snowflake.snowpark import DataFrame as SnowparkDataFrame
 from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+from numpy import int8

-df = PandasDataFrame(
-
-
-
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)

-out_schema = DataFrameSchema(
-
-
-
+out_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+        "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+        "COLUMN3": Column(float, Check.less_than(10)),
+    }
+)

 @check_output_schema(out_schema, "output_schema_checkpoint")
 def preprocessor(dataframe: SnowparkDataFrame):
-    return dataframe.with_column(
+    return dataframe.with_column(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )

 session = Session.builder.getOrCreate()
 sp_dataframe = session.create_dataframe(df)
@@ -441,6 +511,4 @@ sp_dataframe = session.create_dataframe(df)
 preprocessed_dataframe = preprocessor(sp_dataframe)
 ```

-
-
-This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details.
+------
snowpark_checkpoints_validators-0.1.0rc2.dist-info/RECORD

@@ -0,0 +1,4 @@
+snowpark_checkpoints_validators-0.1.0rc2.dist-info/METADATA,sha256=IgK1FDpHJmzVAtFmrBgQxNUmGCW29DReLcXgPlsXGf8,23867
+snowpark_checkpoints_validators-0.1.0rc2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+snowpark_checkpoints_validators-0.1.0rc2.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
+snowpark_checkpoints_validators-0.1.0rc2.dist-info/RECORD,,
snowpark_checkpoints_validators-0.1.0rc1.dist-info/RECORD

@@ -1,4 +0,0 @@
-snowpark_checkpoints_validators-0.1.0rc1.dist-info/METADATA,sha256=emH8WmKPQHTC4KVL-L9Uqi6USzzWwFuDRT7-AmeCVJo,21852
-snowpark_checkpoints_validators-0.1.0rc1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-snowpark_checkpoints_validators-0.1.0rc1.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
-snowpark_checkpoints_validators-0.1.0rc1.dist-info/RECORD,,
{snowpark_checkpoints_validators-0.1.0rc1.dist-info → snowpark_checkpoints_validators-0.1.0rc2.dist-info}/WHEEL
File without changes

{snowpark_checkpoints_validators-0.1.0rc1.dist-info → snowpark_checkpoints_validators-0.1.0rc2.dist-info}/licenses/LICENSE
File without changes
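Each RECORD entry is `path,hash,size`, where the hash is the file's sha256 digest in unpadded urlsafe base64, per the wheel spec. A minimal sketch for checking the rc2 METADATA entry against a locally extracted wheel (the local path is an assumption):

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: str) -> str:
    """Compute a RECORD-style hash: unpadded urlsafe base64 of sha256."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# Hypothetical local path to the extracted wheel's METADATA file.
path = "snowpark_checkpoints_validators-0.1.0rc2.dist-info/METADATA"
print(record_hash(path))          # expect: sha256=IgK1FDpHJmzVAtFmrBgQxNUmGCW29DReLcXgPlsXGf8
print(Path(path).stat().st_size)  # expect: 23867
```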