snowpark-checkpoints-validators 0.1.0rc1__py3-none-any.whl → 0.1.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- snowpark_checkpoints_validators-0.1.0rc1.dist-info/METADATA
+++ snowpark_checkpoints_validators-0.1.0rc2.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: snowpark-checkpoints-validators
- Version: 0.1.0rc1
+ Version: 0.1.0rc2
  Summary: Migration tools for Snowpark
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -244,7 +244,7 @@ Requires-Dist: setuptools>=70.0.0; extra == 'development'
  Requires-Dist: twine==5.1.1; extra == 'development'
  Description-Content-Type: text/markdown

- # Snowpark Checkpoints Validators
+ # snowpark-checkpoints-validators

  ---
  **NOTE**
@@ -270,9 +270,16 @@ This package is on Private Preview.
  The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.

  ```python
- from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.utils.constant import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Any, Optional

- validate_dataframe_checkpoint(
+ # Signature of the function
+ def validate_dataframe_checkpoint(
      df: SnowparkDataFrame,
      checkpoint_name: str,
      job_context: Optional[SnowparkJobContext] = None,
@@ -283,16 +290,17 @@ validate_dataframe_checkpoint(
      sample_number: Optional[int] = None,
      sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
      output_path: Optional[str] = None,
- )
+ ):
+     ...
  ```

- - `df`: Snowpark DataFrame to validate.
- - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `df`: Snowpark dataframe to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or dataframe.
  - `job_context`: Snowpark job context.
  - `mode`: Checkpoint mode (schema or data).
  - `custom_checks`: Custom checks to perform.
  - `skip_checks`: Checks to skip.
- - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_frac`: Fraction of the dataframe to sample.
  - `sample_number`: Number of rows to sample.
  - `sampling_strategy`: Sampling strategy to use.
  - `output_path`: Output path for the checkpoint report.
@@ -301,16 +309,24 @@ validate_dataframe_checkpoint(

  ```python
  from snowflake.snowpark import Session
- from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.utils.constant import (
+     CheckpointMode,
+ )
  from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import SparkSession

  session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
  df = session.read.format("csv").load("data.csv")

  validate_dataframe_checkpoint(
      df,
      "schema_checkpoint",
-     job_context=session,
+     job_context=job_context,
      mode=CheckpointMode.SCHEMA,
      sample_frac=0.1,
      sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
@@ -319,22 +335,24 @@ validate_dataframe_checkpoint(

  ### Check with Spark Decorator

- The `check_with_spark` decorator converts any Snowpark DataFrame arguments to a function, samples them, and converts them to PySpark DataFrames. It then executes a provided Spark function and compares the outputs between the two implementations.
+ The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframes. It then executes a provided Spark function and compares the outputs between the two implementations.

  ```python
- from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Callable, Optional, TypeVar

- @check_with_spark(
+ fn = TypeVar("F", bound=Callable)
+
+ # Signature of the decorator
+ def check_with_spark(
      job_context: Optional[SnowparkJobContext],
-     spark_function: Callable,
+     spark_function: fn,
      checkpoint_name: str,
      sample_number: Optional[int] = 100,
      sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-     check_dtypes: Optional[bool] = False,
-     check_with_precision: Optional[bool] = False,
      output_path: Optional[str] = None,
- )
- def snowpark_fn(df: SnowparkDataFrame):
+ ) -> Callable[[fn], fn]:
      ...
  ```

@@ -343,8 +361,6 @@ def snowpark_fn(df: SnowparkDataFrame):
  - `checkpoint_name`: Name of the check.
  - `sample_number`: Number of rows to sample.
  - `sampling_strategy`: Sampling strategy to use.
- - `check_dtypes`: Check data types.
- - `check_with_precision`: Check with precision.
  - `output_path`: Output path for the checkpoint report.

  ### Usage Example
@@ -353,52 +369,63 @@ def snowpark_fn(df: SnowparkDataFrame):
  from snowflake.snowpark import Session
  from snowflake.snowpark import DataFrame as SnowparkDataFrame
  from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import DataFrame as SparkDataFrame, SparkSession

  session = Session.builder.getOrCreate()
- df = session.read.format("csv").load("data.csv")
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+
+ def my_spark_scalar_fn(df: SparkDataFrame):
+     return df.count()

  @check_with_spark(
-     job_context=session,
-     spark_function=lambda df: df.withColumn("COLUMN1", df["COLUMN1"] + 1),
-     checkpoint_name="Check_Column1_Increment",
-     sample_number=100,
-     sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
+     job_context=job_context,
+     spark_function=my_spark_scalar_fn,
+     checkpoint_name="count_checkpoint",
  )
- def increment_column1(df: SnowparkDataFrame):
-     return df.with_column("COLUMN1", df["COLUMN1"] + 1)
+ def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+     return df.count()

- increment_column1(df)
+ df = job_context.snowpark_session.create_dataframe(
+     [[1, 2], [3, 4]], schema=["a", "b"]
+ )
+ count = my_snowpark_scalar_fn(df)
  ```

  ### Pandera Snowpark Decorators

- The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark DataFrames in the input arguments or in the return value.
+ The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.

  ```python
- from snowflake.snowpark_checkpoints.checkpoint import check_input_schema, check_output_schema
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pandera import DataFrameSchema
+ from typing import Optional

- @check_input_schema(
+ # Signature of the decorator
+ def check_input_schema(
      pandera_schema: DataFrameSchema,
      checkpoint_name: str,
      sample_frac: Optional[float] = 1.0,
      sample_number: Optional[int] = None,
      sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-     job_context: Optional[SnowparkJobContext],
+     job_context: Optional[SnowparkJobContext] = None,
      output_path: Optional[str] = None,
- )
- def snowpark_fn(df: SnowparkDataFrame):
+ ):
      ...

- @check_output_schema(
+ # Signature of the decorator
+ def check_output_schema(
      pandera_schema: DataFrameSchema,
      checkpoint_name: str,
      sample_frac: Optional[float] = 1.0,
      sample_number: Optional[int] = None,
      sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-     job_context: Optional[SnowparkJobContext],
+     job_context: Optional[SnowparkJobContext] = None,
      output_path: Optional[str] = None,
- )
- def snowpark_fn(df: SnowparkDataFrame):
+ ):
      ...
  ```

@@ -412,28 +439,71 @@ def snowpark_fn(df: SnowparkDataFrame):

  ### Usage Example

- The following will result in a Pandera `SchemaError`:
+ #### Check Input Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )

+ in_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+         "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+     }
+ )
+
+ @check_input_schema(in_schema, "input_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     dataframe = dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+     return dataframe
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
+
+ #### Check Output Schema Example
  ```python
  from pandas import DataFrame as PandasDataFrame
  from pandera import DataFrameSchema, Column, Check
  from snowflake.snowpark import Session
  from snowflake.snowpark import DataFrame as SnowparkDataFrame
  from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+ from numpy import int8

- df = PandasDataFrame({
-     "COLUMN1": [1, 4, 0, 10, 9],
-     "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
- })
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )

- out_schema = DataFrameSchema({
-     "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
-     "COLUMN2": Column(float, Check(lambda x: x < -1.2)),
- })
+ out_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+         "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+         "COLUMN3": Column(float, Check.less_than(10)),
+     }
+ )

  @check_output_schema(out_schema, "output_schema_checkpoint")
  def preprocessor(dataframe: SnowparkDataFrame):
-     return dataframe.with_column("COLUMN1", lit('Some bad data yo'))
+     return dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )

  session = Session.builder.getOrCreate()
  sp_dataframe = session.create_dataframe(df)
@@ -441,6 +511,4 @@ sp_dataframe = session.create_dataframe(df)
  preprocessed_dataframe = preprocessor(sp_dataframe)
  ```

- ## License
-
- This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details.
+ ------
--- /dev/null
+++ snowpark_checkpoints_validators-0.1.0rc2.dist-info/RECORD
@@ -0,0 +1,4 @@
+ snowpark_checkpoints_validators-0.1.0rc2.dist-info/METADATA,sha256=IgK1FDpHJmzVAtFmrBgQxNUmGCW29DReLcXgPlsXGf8,23867
+ snowpark_checkpoints_validators-0.1.0rc2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ snowpark_checkpoints_validators-0.1.0rc2.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
+ snowpark_checkpoints_validators-0.1.0rc2.dist-info/RECORD,,
--- snowpark_checkpoints_validators-0.1.0rc1.dist-info/RECORD
+++ /dev/null
@@ -1,4 +0,0 @@
- snowpark_checkpoints_validators-0.1.0rc1.dist-info/METADATA,sha256=emH8WmKPQHTC4KVL-L9Uqi6USzzWwFuDRT7-AmeCVJo,21852
- snowpark_checkpoints_validators-0.1.0rc1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- snowpark_checkpoints_validators-0.1.0rc1.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
- snowpark_checkpoints_validators-0.1.0rc1.dist-info/RECORD,,