data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
  2. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
  3. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
  4. data_processing/data_access/data_access.py +4 -1
  5. data_processing/data_access/data_access_local.py +0 -11
  6. data_processing/data_access/data_access_s3.py +0 -11
  7. data_processing/runtime/pure_python/transform_file_processor.py +9 -3
  8. data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
  9. data_processing/runtime/pure_python/transform_runtime.py +9 -1
  10. data_processing/runtime/transform_file_processor.py +53 -32
  11. data_processing/test_support/data_access/data_access_factory_test.py +12 -0
  12. data_processing/test_support/transform/__init__.py +9 -4
  13. data_processing/test_support/transform/noop_folder_transform.py +105 -0
  14. data_processing/test_support/transform/noop_transform.py +3 -3
  15. data_processing/transform/__init__.py +2 -0
  16. data_processing/transform/abstract_transform.py +16 -0
  17. data_processing/transform/binary_transform.py +3 -2
  18. data_processing/transform/folder_transform.py +40 -0
  19. data_processing/transform/transform_configuration.py +3 -3
  20. data_processing/utils/multilock.py +160 -0
  21. data_processing/utils/unrecoverable.py +13 -0
  22. data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
  23. data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
  24. data_processing_ray/runtime/ray/transform_runtime.py +9 -1
  25. data_processing_ray/test_support/transform/__init__.py +1 -0
  26. data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
  27. data_processing_ray/test_support/transform/noop_transform.py +1 -3
  28. data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
  29. data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
  30. data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
  31. data_processing_spark/runtime/spark/transform_runtime.py +24 -6
  32. data_processing_spark/test_support/transform/__init__.py +1 -0
  33. data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
  34. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.2.dev1
3
+ Version: 0.2.2.dev2
4
4
  Summary: Data Preparation Toolkit Library for Ray and Python
5
5
  Author-email: Maroun Touma <touma@us.ibm.com>
6
6
  License: Apache-2.0
@@ -30,4 +30,36 @@ Requires-Dist: pillow>=10.3.0; extra == "ray"
30
30
  Provides-Extra: spark
31
31
  Requires-Dist: pyspark>=3.5.2; extra == "spark"
32
32
  Requires-Dist: psutil>=6.0.0; extra == "spark"
33
+ Requires-Dist: PyYAML>=6.0.2; extra == "spark"
34
+
35
+ # Data Processing Library
36
+ This provides a python framework for developing _transforms_
37
+ on data stored in files - currently parquet files are supported -
38
+ and running them in a [ray](https://www.ray.io/) cluster.
39
+ Data files may be stored in the local file system or COS/S3.
40
+ For more details see the [documentation](../doc/overview.md).
41
+
42
+ ### Virtual Environment
43
+ The project uses `pyproject.toml` and a Makefile for operations.
44
+ To do development you should establish the virtual environment
45
+ ```shell
46
+ make venv
47
+ ```
48
+ and then either activate
49
+ ```shell
50
+ source venv/bin/activate
51
+ ```
52
+ or set up your IDE to use the venv directory when developing in this project
53
+
54
+ ## Library Artifact Build and Publish
55
+ To test, build and publish the library
56
+ ```shell
57
+ make test build publish
58
+ ```
59
+
60
+ To bump the version number, edit the Makefile to change VERSION and rerun
61
+ the above. This will require committing both the `Makefile` and the
62
+ automatically updated `pyproject.toml` file.
63
+
64
+
33
65
 
@@ -1,72 +1,78 @@
1
1
  data_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  data_processing/data_access/__init__.py,sha256=996nZjaFiXZ-Zu648oC3eGlvZPTu-DTgsqvlKY4y4Ck,496
3
3
  data_processing/data_access/arrow_s3.py,sha256=0wxYFrw1MSsntO0IbGUGaBuiB366cIAmpsWvycenU_4,9724
4
- data_processing/data_access/data_access.py,sha256=_Xo5FjcJXQvkRlxJp0pfGoCgt50CNO6hmCUbAflLwE0,18632
4
+ data_processing/data_access/data_access.py,sha256=hfERzfigBLgYoq5hsODcgsOMgh6lKISAGy_LP76Y2go,18813
5
5
  data_processing/data_access/data_access_factory.py,sha256=Hdm06HBs_AIjmU4Aqyd_8klZEty57MbjueGlF-8basw,12339
6
6
  data_processing/data_access/data_access_factory_base.py,sha256=dY_69ic4hKCQWSbRPNqxTRs8gcBhbW9rbvcAeeq0iFk,5685
7
- data_processing/data_access/data_access_local.py,sha256=swwLGNaLDFjXr0ViZdsagB3JM5ainOP0stf-hQOvSo8,9229
8
- data_processing/data_access/data_access_s3.py,sha256=QbDWalTiScP8nko7Txa8oVWK0neW6adcLGxjenz0LP0,8249
7
+ data_processing/data_access/data_access_local.py,sha256=5XozY_r_44ExjbFEPAzaIJDNEx3AKSk8h7zmf30Q2N0,8799
8
+ data_processing/data_access/data_access_s3.py,sha256=6cPkLY4-yOhtoCryDDXbEWq4SHJpHEh2_VTnBbsQgsk,7815
9
9
  data_processing/data_access/snapshot_utils.py,sha256=5iT0sP5ZkXQpZmHQh_5Do2W846Xc04-6GPmociqxmAY,1207
10
10
  data_processing/runtime/__init__.py,sha256=E9jeZ28v-TOGGB5O91GZ9GG9dP86bK02-o4qWPj8TNw,389
11
11
  data_processing/runtime/execution_configuration.py,sha256=9EKln8O0bjKRbb7GysegxNQ6t9stI8GqbFIxscPYeak,3218
12
12
  data_processing/runtime/runtime_configuration.py,sha256=uAKr-aawKcr_5Yq9fG8SktHI-Z1nCoVsu84UF2NiV9U,2769
13
- data_processing/runtime/transform_file_processor.py,sha256=-Xdtq0sC99sIB9e6hssSvd_OOFBo9RCu1s8-6q16fi0,9916
13
+ data_processing/runtime/transform_file_processor.py,sha256=SQ_ICPPXCLcBr_X2lmOLiddyjCfz4CCaFtvatu8tcSc,10898
14
14
  data_processing/runtime/transform_launcher.py,sha256=34144dkMHSt58PQ1wbAKnELnhi9lubJcNCBopWoLRqo,4593
15
15
  data_processing/runtime/pure_python/__init__.py,sha256=s0smp908uzeT1cd_6EERaI72WJqiOJ50SDC5yOgDBvc,744
16
16
  data_processing/runtime/pure_python/execution_configuration.py,sha256=C0DCP3YR9sirFQ1Zqz4KcZ03WA5FLXl_kky9PBHREYQ,2617
17
17
  data_processing/runtime/pure_python/runtime_configuration.py,sha256=a4vSY98HfRm2p6pIBNpT-wDEtqRX2RUbRpFYtXyq5Kk,1710
18
- data_processing/runtime/pure_python/transform_file_processor.py,sha256=PYWNUSeb6i6q6Ov7nE0jXQfHIhp1u9adArEU3mQ7B24,4394
18
+ data_processing/runtime/pure_python/transform_file_processor.py,sha256=jN-8OnPgctfY80W13WOnsdxCpDHi4HAgv0bM0Ysx-H4,4586
19
19
  data_processing/runtime/pure_python/transform_invoker.py,sha256=lAG7tfyJyNqtwRB15-db4HJOQsBhT6JahLmjUFQFCRk,5192
20
20
  data_processing/runtime/pure_python/transform_launcher.py,sha256=BDctJnYlR9OVzGCzMwg2cEuGdnV3E9fvhUgoyslvK8k,2447
21
- data_processing/runtime/pure_python/transform_orchestrator.py,sha256=OIQzOL0jT-3ahT7aDs6suySkoEhmvM_T4C_qMDt0JSQ,9468
22
- data_processing/runtime/pure_python/transform_runtime.py,sha256=pWvuGJGAB6M798LJU3FZBG6l35VQCuhsh-SyzSf9ok0,2558
21
+ data_processing/runtime/pure_python/transform_orchestrator.py,sha256=YtELV-ENqqklSGr1N2yqNaSsQZlR0o-lk16PNxDAHYE,10067
22
+ data_processing/runtime/pure_python/transform_runtime.py,sha256=3fz3_c_pYJkShXj_EuCGg9ieIb9ysfc_yhttsSjSDi4,2806
23
23
  data_processing/test_support/__init__.py,sha256=O4lySih15vkOYUSa3uhTaoYw0RrV4rM_sUd691JEuVU,83
24
24
  data_processing/test_support/abstract_test.py,sha256=gZ51wnWITEAyb8BzA2WFCM0quJBxQrlD7WBwUfIsWEA,12875
25
25
  data_processing/test_support/data_access/__init__.py,sha256=q4xqedYF-a1Kkk64i1ToiEW_SZQDMw66mV1s-Owsqq0,69
26
- data_processing/test_support/data_access/data_access_factory_test.py,sha256=mKPPsGeXABjTaw5v1s5CGwELDgEBAJoIybLIFIsCtSM,2669
26
+ data_processing/test_support/data_access/data_access_factory_test.py,sha256=AE1EWibaNtu3svA4s94GFd0RzJtdTyCsBYKnPM0U2Ww,3328
27
27
  data_processing/test_support/launch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  data_processing/test_support/launch/transform_test.py,sha256=vcqLeZZPSiJ0HPkr0qaBSLGhzKMaXeErtQdKu6fsiTs,5435
29
- data_processing/test_support/transform/__init__.py,sha256=pwrft_Q3NQZyySA9rKE-3ORo_Vi-Rq3nKtdD7w-bzXY,213
29
+ data_processing/test_support/transform/__init__.py,sha256=zn8RWuDBFS1JNL1QCVGrGCBEMn6bcEqTaOsWp9XnGTg,502
30
30
  data_processing/test_support/transform/binary_transform_test.py,sha256=ea3ipfxUuOYYxZaNSV-2ad2Q2Oqg32uenMAqxHSbDB4,4573
31
- data_processing/test_support/transform/noop_transform.py,sha256=lDSceUbU_M40Zn2aDrhr3ZsWWB762iREVXmabg4JBX0,5638
31
+ data_processing/test_support/transform/noop_folder_transform.py,sha256=0V74ZVCb6DgOKlgtDCsYPDeYIg6xJm1vLnCiTlAjqeM,4410
32
+ data_processing/test_support/transform/noop_transform.py,sha256=9OnvZ5vhfveTjWSILnT8FyhvEL57yTIc_3wFi1R8YFw,5697
32
33
  data_processing/test_support/transform/table_transform_test.py,sha256=nTlomV3-X9kkiWaR7AF_qFp5T2fWWSoDCEi8p5SpqtI,4619
33
- data_processing/transform/__init__.py,sha256=tLmeKWCRKDEjMRyjOLbjmJFQHGcL0i-4YeTJF41bWjg,342
34
- data_processing/transform/binary_transform.py,sha256=Nh8C4qkvuxdku16CgTIHiVO1hnp2I9TzaYw5YN7qa9E,2916
34
+ data_processing/transform/__init__.py,sha256=RqHj9FCbkKsYqeZRdFLiFapnJkWaj0AhPAdB0-Fy_BE,496
35
+ data_processing/transform/abstract_transform.py,sha256=nSPaJiu-M1Ptzmixw3KgSlKXJyeFp7J8PIGkt9ULiog,738
36
+ data_processing/transform/binary_transform.py,sha256=oM_v5_HsgM914FamzWP_IgpXxp4cxSGzer9io-G2c8k,2982
37
+ data_processing/transform/folder_transform.py,sha256=XtdSHlvs4qbVTfXAWIVbpPtSnQjMQOYvJq8WZHeSdtU,1910
35
38
  data_processing/transform/table_transform.py,sha256=BliQKz7ShFTpXRe9WBYAeRM7n-AhEWYj5kb05QAxS2c,6281
36
- data_processing/transform/transform_configuration.py,sha256=uBrhCafxvy8bc5E11QwdQT9USrUZtiYjSp_PQH5OmFE,4413
39
+ data_processing/transform/transform_configuration.py,sha256=4hLPy78eMpFw0yKQy2ICZcFicUz--kvgUFAXorUkkwE,4395
37
40
  data_processing/transform/transform_statistics.py,sha256=eeInqfFFxhmQoaMJXXaOq_zY49mmQYAA_9xmdwi5vDw,1437
38
41
  data_processing/utils/__init__.py,sha256=lL25FenOmye6EBtcQLbczJJ3NtrGlmW9v0hDQfKrCno,584
39
42
  data_processing/utils/cli_utils.py,sha256=vjs6qOxxYetEYCYx_8jX_KBVwA7z_iBYFPhyPyucCGM,3135
40
43
  data_processing/utils/config.py,sha256=GZgmmvjiSEL86Vk_wx7lJ-Y27ItQXujthb5mcdV4FMA,1695
41
44
  data_processing/utils/log.py,sha256=t7S_ITjWJicEaCPcSk3cKKXyzEe1u21Ov5aQz3CzrZQ,2226
45
+ data_processing/utils/multilock.py,sha256=ra5Qy61zqmdPNhAqYNBXLG6L6LjTxTyTpDLMOuc99x0,5734
42
46
  data_processing/utils/params_utils.py,sha256=oAKY3wC8b17rDUJGqX19-rAQHDc9SQn1ksTeo3RFgCY,6668
43
47
  data_processing/utils/pipinstaller.py,sha256=PxFNwEy8v4FqjwYgrPhH0UTrCgsJvM5WAE2fKylsk2Q,2511
44
48
  data_processing/utils/transform_configuration.json,sha256=6YBw0Hk2mokY6JBn1kR6L9AkV_yivbFrpSoHecAJp9o,4562
45
49
  data_processing/utils/transform_configurator.py,sha256=9OHSCQ8rFSoDdMW6ZCHYdNe6thRwV9zOaRPnLkWNMYE,3601
46
50
  data_processing/utils/transform_utils.py,sha256=KGNioN35B1i1h-MIsfm3QvXLlU1aGXimheva7NbUhMM,8496
47
- data_processing/utils/unrecoverable.py,sha256=svNdVzQaArnf8GdLvB2nP9miv7kYe3bDfFRW--SWvbU,171
51
+ data_processing/utils/unrecoverable.py,sha256=cbF74AGK1IdRor_L1w_hPwglV_b2blP6Ad4ET79xrl0,831
48
52
  data_processing_ray/runtime/ray/__init__.py,sha256=vjQOvb_OJNq3c1F_tG3WjO-pciY77Z1lETO2Ha_GVbw,784
49
53
  data_processing_ray/runtime/ray/execution_configuration.py,sha256=C9YFixlATr7PPpkVQ0WzjCCPTWFuP80W2rnzY1bbp5I,4628
50
54
  data_processing_ray/runtime/ray/ray_utils.py,sha256=eDPm-pybPOELjKkvoz3l-qFU-k1Iwh-giGlXULiZjEk,10212
51
55
  data_processing_ray/runtime/ray/runtime_configuration.py,sha256=js9dXwdxjYbSigMC49F07XmbLjmj9HiipPE6BDaIGfA,1691
52
- data_processing_ray/runtime/ray/transform_file_processor.py,sha256=eR814VvfmPOlvyv_FU7eyt0HRqIzkkvACURxJCK-xrM,2335
56
+ data_processing_ray/runtime/ray/transform_file_processor.py,sha256=vciH2OViK_RLSzze34kaLkRNi5Bg2JmH-KQAYx5zoo0,2388
53
57
  data_processing_ray/runtime/ray/transform_invoker.py,sha256=apfH8uilpm9sJ4IpHgiNdIzcH_IHGQba5fui4cCfolk,4026
54
58
  data_processing_ray/runtime/ray/transform_launcher.py,sha256=oxI3MFZI_-LzTwHbrHBIUqJ0htnliKBuALt86qijRwU,4304
55
- data_processing_ray/runtime/ray/transform_orchestrator.py,sha256=FZl7NM0eU1SxOcavZm4lru3laCswACPB1rjk3KK3FtY,7102
56
- data_processing_ray/runtime/ray/transform_runtime.py,sha256=0-b5syOW9zNnZxmMHDdwPo_pvoqDBiM5dHCgSakZhGQ,2531
59
+ data_processing_ray/runtime/ray/transform_orchestrator.py,sha256=ZY4Wu7luS3kiUWAsTPZAJg4upyNRwGl2Ez0K832vAIw,7552
60
+ data_processing_ray/runtime/ray/transform_runtime.py,sha256=s-9lrPBg-ifRRYJPJfbL8P9cOpYdvQKcairxpaIbB4E,2779
57
61
  data_processing_ray/runtime/ray/transform_statistics.py,sha256=cxrSQVnzRBCGS68IoiVGLoRBWBxPBSFFMDiT29FNt0g,3749
58
- data_processing_ray/test_support/transform/__init__.py,sha256=CKk-J3aEwH7OgDardyUEbLjlWaZWLUBs93PdukT4Rbc,100
59
- data_processing_ray/test_support/transform/noop_transform.py,sha256=ZTx09M9vNOaqrVzeuT2VmWM-IF4Upip0g0EtbHaOn-0,1588
62
+ data_processing_ray/test_support/transform/__init__.py,sha256=P9stuAcnpjeGrJ8HJKRTuT2BKwLtG_3uG1UCYpnkw8c,213
63
+ data_processing_ray/test_support/transform/noop_folder_transform.py,sha256=Sw5heLb4EyPYpTjoCW1K53FOt7BH1Zob2bLZUQbwmfs,2075
64
+ data_processing_ray/test_support/transform/noop_transform.py,sha256=97o8oizjhBR_dqdrXhwss5QvW8-FZ_QYfTQGXpyUpGk,1564
60
65
  data_processing_spark/runtime/spark/__init__.py,sha256=bhY1xI9lL0GR2v1APahlhC5sh5rdVcGhQbWN4yoXApw,1233
61
66
  data_processing_spark/runtime/spark/execution_configuration.py,sha256=BqxUlpXFdHRK-csO2jaJJtktyKbcMtjIn3sjAPBfO58,3643
62
- data_processing_spark/runtime/spark/runtime_configuration.py,sha256=uABzBvzzFZ5HA_lGYEBFRd1qViMj5sbpKtCSLy64riM,1705
63
- data_processing_spark/runtime/spark/transform_file_processor.py,sha256=sdDBZZyqCqyKaJmEqZh1QzIqCDkLQzqV0dcAI5TRBjo,2611
67
+ data_processing_spark/runtime/spark/runtime_configuration.py,sha256=lAx7aGLq0_-ySJSc_fx5ek1bIgLcGUgfmAM-HcVLkqc,2377
68
+ data_processing_spark/runtime/spark/transform_file_processor.py,sha256=E0JEoEvAqdygYBHkvlm5kdkNbPO4aPQljfk833_Fqv8,2682
64
69
  data_processing_spark/runtime/spark/transform_launcher.py,sha256=1PZ-N4Wy2Qqiqr2z9S1xV88cNsAoHrmmuPadiOakJLM,2479
65
- data_processing_spark/runtime/spark/transform_orchestrator.py,sha256=HadnLNx_icy2n7CXOwqLiUA7vjV-gOvajxE0AQU3_NM,6645
66
- data_processing_spark/runtime/spark/transform_runtime.py,sha256=IKChGY1uGxFlAqZaL-XeSv_J3BMm3nev9MAs0NTT8og,2506
67
- data_processing_spark/test_support/transform/__init__.py,sha256=v58HbP2x9KF8MG8SOGWjodrTjU57KXlL0aPPB7z8KQQ,755
70
+ data_processing_spark/runtime/spark/transform_orchestrator.py,sha256=64lme-KZz-a1CQUThYnLtx8m99VMlYv8MJB7dHCIAo4,9470
71
+ data_processing_spark/runtime/spark/transform_runtime.py,sha256=je27rTRdd-5Wtd8nc8ogUwxZqUd4ZgP5122tA8JtTKA,3258
72
+ data_processing_spark/test_support/transform/__init__.py,sha256=FQJyj7z1hXQynngMVQlCTJxTh2bdc4jN4220CBmLTqE,872
73
+ data_processing_spark/test_support/transform/noop_folder_transform.py,sha256=z0jXCVKJYHPqB9ZTfUxnQkUVDnmfWjvss4_I3QZ8JZ4,2187
68
74
  data_processing_spark/test_support/transform/noop_transform.py,sha256=0FR3o-LnXf-UFS5gU0j-i4LVlw1mHDxGaPI40dkkIKY,1694
69
- data_prep_toolkit-0.2.2.dev1.dist-info/METADATA,sha256=QiZEK2qc8or6csEZk_weYltxiDRFvDb0chVpwLMCMrU,1235
70
- data_prep_toolkit-0.2.2.dev1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
71
- data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
72
- data_prep_toolkit-0.2.2.dev1.dist-info/RECORD,,
75
+ data_prep_toolkit-0.2.2.dev2.dist-info/METADATA,sha256=XgskYjPA5pddqDgaBrPpe1IeqOpHPB2WscNM4dRh7XQ,2240
76
+ data_prep_toolkit-0.2.2.dev2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
77
+ data_prep_toolkit-0.2.2.dev2.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
78
+ data_prep_toolkit-0.2.2.dev2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -358,7 +358,10 @@ class DataAccess:
358
358
  :param path: input file location
359
359
  :return: output file location
360
360
  """
361
- raise NotImplementedError("Subclasses should implement this!")
361
+ if self.get_output_folder() is None:
362
+ self.logger.error("Get out put location. S3 configuration is not provided, returning None")
363
+ return None
364
+ return path.replace(self.get_input_folder(), self.get_output_folder())
362
365
 
363
366
  def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
364
367
  """
@@ -130,17 +130,6 @@ class DataAccessLocal(DataAccess):
130
130
  logger.error(f"Error reading table from {path}: {e}")
131
131
  return None, 0
132
132
 
133
- def get_output_location(self, path: str) -> str:
134
- """
135
- Get output location based on input
136
- :param path: input file location
137
- :return: output file location
138
- """
139
- if self.output_folder is None:
140
- logger.error("Get output location. local configuration is not defined, returning None")
141
- return None
142
- return path.replace(self.input_folder, self.output_folder)
143
-
144
133
  def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
145
134
  """
146
135
  Saves a pyarrow table to a file and returns information about the operation.
@@ -126,17 +126,6 @@ class DataAccessS3(DataAccess):
126
126
  self.logger.error(f"Exception reading table {path} from S3 - {e}")
127
127
  return None, 0
128
128
 
129
- def get_output_location(self, path: str) -> str:
130
- """
131
- Get output location based on input
132
- :param path: input file location
133
- :return: output file location
134
- """
135
- if self.output_folder is None:
136
- self.logger.error("Get out put location. S3 configuration is not provided, returning None")
137
- return None
138
- return path.replace(self.input_folder, self.output_folder)
139
-
140
129
  def save_table(self, path: str, table: pyarrow.Table) -> tuple[int, dict[str, Any], int]:
141
130
  """
142
131
  Save table to a given location
@@ -14,7 +14,7 @@ from typing import Any
14
14
 
15
15
  from data_processing.data_access import DataAccessFactoryBase
16
16
  from data_processing.runtime import AbstractTransformFileProcessor
17
- from data_processing.transform import AbstractBinaryTransform, TransformStatistics
17
+ from data_processing.transform import AbstractTransform, TransformStatistics
18
18
  from data_processing.utils import UnrecoverableException
19
19
 
20
20
 
@@ -28,7 +28,8 @@ class PythonTransformFileProcessor(AbstractTransformFileProcessor):
28
28
  data_access_factory: DataAccessFactoryBase,
29
29
  statistics: TransformStatistics,
30
30
  transform_params: dict[str, Any],
31
- transform_class: type[AbstractBinaryTransform],
31
+ transform_class: type[AbstractTransform],
32
+ is_folder: bool,
32
33
  ):
33
34
  """
34
35
  Init method
@@ -36,11 +37,13 @@ class PythonTransformFileProcessor(AbstractTransformFileProcessor):
36
37
  :param statistics - reference to statistics class
37
38
  :param transform_params - transform parameters
38
39
  :param transform_class: transform class
40
+ :param is_folder: folder transform flag
39
41
  """
40
42
  # invoke superclass
41
43
  super().__init__(
42
44
  data_access_factory=data_access_factory,
43
45
  transform_parameters=dict(transform_params),
46
+ is_folder=is_folder,
44
47
  )
45
48
  self.transform_params["statistics"] = statistics
46
49
  # Create local processor
@@ -65,17 +68,20 @@ class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
65
68
  self,
66
69
  data_access_factory: DataAccessFactoryBase,
67
70
  transform_params: dict[str, Any],
68
- transform_class: type[AbstractBinaryTransform],
71
+ transform_class: type[AbstractTransform],
72
+ is_folder: bool
69
73
  ):
70
74
  """
71
75
  Init method
72
76
  :param data_access_factory - data access factory
73
77
  :param transform_params - transform parameters
74
78
  :param transform_class: transform class
79
+ :param is_folder: folder transform flag
75
80
  """
76
81
  super().__init__(
77
82
  data_access_factory=data_access_factory,
78
83
  transform_parameters=dict(transform_params),
84
+ is_folder=is_folder,
79
85
  )
80
86
  # Add data access and statistics to the processor parameters
81
87
  self.transform_params["data_access"] = self.data_access
@@ -24,14 +24,13 @@ from data_processing.runtime.pure_python import (
24
24
  PythonTransformFileProcessor,
25
25
  PythonTransformRuntimeConfiguration,
26
26
  )
27
- from data_processing.transform import AbstractBinaryTransform, TransformStatistics
27
+ from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform
28
28
  from data_processing.utils import GB, get_logger
29
29
 
30
30
 
31
31
  logger = get_logger(__name__)
32
32
 
33
33
 
34
- @staticmethod
35
34
  def _execution_resources() -> dict[str, Any]:
36
35
  """
37
36
  Get Execution resource
@@ -49,7 +48,6 @@ def _execution_resources() -> dict[str, Any]:
49
48
  }
50
49
 
51
50
 
52
-
53
51
  def orchestrate(
54
52
  data_access_factory: DataAccessFactoryBase,
55
53
  runtime_config: PythonTransformRuntimeConfiguration,
@@ -74,15 +72,21 @@ def orchestrate(
74
72
  return 1
75
73
  # create additional execution parameters
76
74
  runtime = runtime_config.create_transform_runtime()
75
+ is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
77
76
  try:
78
- # Get files to process
79
- files, profile, retries = data_access.get_files_to_process()
80
- if len(files) == 0:
81
- logger.error("No input files to process - exiting")
82
- return 0
83
- if retries > 0:
84
- statistics.add_stats({"data access retries": retries})
85
- logger.info(f"Number of files is {len(files)}, source profile {profile}")
77
+ if is_folder:
78
+ # folder transform
79
+ files = runtime.get_folders(data_access=data_access)
80
+ logger.info(f"Number of folders is {len(files)}")
81
+ else:
82
+ # Get files to process
83
+ files, profile, retries = data_access.get_files_to_process()
84
+ if len(files) == 0:
85
+ logger.error("No input files to process - exiting")
86
+ return 0
87
+ if retries > 0:
88
+ statistics.add_stats({"data access retries": retries})
89
+ logger.info(f"Number of files is {len(files)}, source profile {profile}")
86
90
  # Print interval
87
91
  print_interval = int(len(files) / 100)
88
92
  if print_interval == 0:
@@ -99,6 +103,7 @@ def orchestrate(
99
103
  data_access_factory=data_access_factory, statistics=statistics, files=files
100
104
  ),
101
105
  transform_class=runtime_config.get_transform_class(),
106
+ is_folder=is_folder,
102
107
  )
103
108
  else:
104
109
  # using sequential execution
@@ -111,6 +116,7 @@ def orchestrate(
111
116
  data_access_factory=data_access_factory, statistics=statistics, files=files
112
117
  ),
113
118
  transform_class=runtime_config.get_transform_class(),
119
+ is_folder=is_folder,
114
120
  )
115
121
  status = "success"
116
122
  return_code = 0
@@ -139,7 +145,8 @@ def orchestrate(
139
145
  "job_input_params": input_params
140
146
  | data_access_factory.get_input_params()
141
147
  | execution_config.get_input_params(),
142
- "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)},
148
+ "execution_stats": _execution_resources() |
149
+ {"execution time, min": round((time.time() - start_time) / 60.0, 3)},
143
150
  "job_output_stats": stats,
144
151
  }
145
152
  logger.debug(f"Saving job metadata: {metadata}.")
@@ -157,7 +164,8 @@ def _process_transforms(
157
164
  data_access_factory: DataAccessFactoryBase,
158
165
  statistics: TransformStatistics,
159
166
  transform_params: dict[str, Any],
160
- transform_class: type[AbstractBinaryTransform],
167
+ transform_class: type[AbstractTransform],
168
+ is_folder: bool,
161
169
  ) -> None:
162
170
  """
163
171
  Process transforms sequentially
@@ -167,9 +175,8 @@ def _process_transforms(
167
175
  :param data_access_factory: data access factory
168
176
  :param transform_params - transform parameters
169
177
  :param transform_class: transform class
178
+ :param is_folder: folder transform flag
170
179
  :return: metadata for the execution
171
-
172
- :return: None
173
180
  """
174
181
  # create executor
175
182
  executor = PythonTransformFileProcessor(
@@ -177,6 +184,7 @@ def _process_transforms(
177
184
  statistics=statistics,
178
185
  transform_params=transform_params,
179
186
  transform_class=transform_class,
187
+ is_folder=is_folder,
180
188
  )
181
189
  # process data
182
190
  t_start = time.time()
@@ -202,7 +210,8 @@ def _process_transforms_multiprocessor(
202
210
  print_interval: int,
203
211
  data_access_factory: DataAccessFactoryBase,
204
212
  transform_params: dict[str, Any],
205
- transform_class: type[AbstractBinaryTransform],
213
+ transform_class: type[AbstractTransform],
214
+ is_folder: bool
206
215
  ) -> TransformStatistics:
207
216
  """
208
217
  Process transforms using multiprocessing pool
@@ -212,13 +221,17 @@ def _process_transforms_multiprocessor(
212
221
  :param data_access_factory: data access factory
213
222
  :param transform_params - transform parameters
214
223
  :param transform_class: transform class
224
+ :param is_folder: folder transform flag
215
225
  :return: metadata for the execution
216
226
  """
217
227
  # result statistics
218
228
  statistics = TransformStatistics()
219
229
  # create processor
220
230
  processor = PythonPoolTransformFileProcessor(
221
- data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class
231
+ data_access_factory=data_access_factory,
232
+ transform_params=transform_params,
233
+ transform_class=transform_class,
234
+ is_folder=is_folder,
222
235
  )
223
236
  completed = 0
224
237
  t_start = time.time()
@@ -12,7 +12,7 @@
12
12
 
13
13
  from typing import Any
14
14
 
15
- from data_processing.data_access import DataAccessFactoryBase
15
+ from data_processing.data_access import DataAccessFactoryBase, DataAccess
16
16
  from data_processing.transform import TransformStatistics
17
17
 
18
18
 
@@ -28,6 +28,14 @@ class DefaultPythonTransformRuntime:
28
28
  """
29
29
  self.params = params
30
30
 
31
+ def get_folders(self, data_access: DataAccess) -> list[str]:
32
+ """
33
+ Get folders to process
34
+ :param data_access: data access
35
+ :return: list of folders to process
36
+ """
37
+ raise NotImplemented()
38
+
31
39
  def get_transform_config(
32
40
  self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
33
41
  ) -> dict[str, Any]:
@@ -26,11 +26,13 @@ class AbstractTransformFileProcessor:
26
26
  self,
27
27
  data_access_factory: DataAccessFactoryBase,
28
28
  transform_parameters: dict[str, Any],
29
+ is_folder: bool = False,
29
30
  ):
30
31
  """
31
32
  Init method
32
33
  :param data_access_factory: Data Access Factory
33
34
  :param transform_parameters: Transform parameters
35
+ :param is_folder: folder transform flag
34
36
  """
35
37
  self.logger = get_logger(__name__)
36
38
  # validate parameters
@@ -46,6 +48,7 @@ class AbstractTransformFileProcessor:
46
48
  # Add data access and statistics to the processor parameters
47
49
  self.transform_params = transform_parameters
48
50
  self.transform_params["data_access"] = self.data_access
51
+ self.is_folder = is_folder
49
52
 
50
53
  def process_file(self, f_name: str) -> None:
51
54
  """
@@ -58,25 +61,30 @@ class AbstractTransformFileProcessor:
58
61
  self.logger.warning("No data_access found. Returning.")
59
62
  return
60
63
  t_start = time.time()
61
- # Read source file
62
- filedata, retries = self.data_access.get_file(path=f_name)
63
- if retries > 0:
64
- self._publish_stats({"data access retries": retries})
65
- if filedata is None:
66
- self.logger.warning(f"File read resulted in None for {f_name}. Returning.")
67
- self._publish_stats({"failed_reads": 1})
68
- return
69
- self._publish_stats({"source_files": 1, "source_size": len(filedata)})
64
+ if not self.is_folder:
65
+ # Read source file only if we are processing file
66
+ filedata, retries = self.data_access.get_file(path=f_name)
67
+ if retries > 0:
68
+ self._publish_stats({"data access retries": retries})
69
+ if filedata is None:
70
+ self.logger.warning(f"File read resulted in None for {f_name}. Returning.")
71
+ self._publish_stats({"failed_reads": 1})
72
+ return
73
+ self._publish_stats({"source_files": 1, "source_size": len(filedata)})
70
74
  # Process input file
71
75
  try:
72
- # execute local processing
73
- name_extension = TransformUtils.get_file_extension(f_name)
74
76
  self.logger.debug(f"Begin transforming file {f_name}")
75
- out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata)
77
+ if not self.is_folder:
78
+ # execute local processing
79
+ out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata)
80
+ name_extension = TransformUtils.get_file_extension(f_name)
81
+ self.last_file_name = name_extension[0]
82
+ self.last_file_name_next_index = None
83
+ self.last_extension = name_extension[1]
84
+ else:
85
+ out_files, stats = self.transform.transform(folder_name=f_name)
86
+ self.last_file_name = f_name
76
87
  self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files")
77
- self.last_file_name = name_extension[0]
78
- self.last_file_name_next_index = None
79
- self.last_extension = name_extension[1]
80
88
  # save results
81
89
  self._submit_file(t_start=t_start, out_files=out_files, stats=stats)
82
90
  # Process unrecoverable exceptions
@@ -95,10 +103,10 @@ class AbstractTransformFileProcessor:
95
103
  the hook for them to return back locally stored data and their statistics.
96
104
  :return: None
97
105
  """
98
- if self.last_file_name is None:
106
+ if self.last_file_name is None or self.is_folder:
99
107
  # for some reason a given worker never processed anything. Happens in testing
100
108
  # when the amount of workers is greater than the amount of files
101
- self.logger.debug("skipping flush, no name for file is defined")
109
+ self.logger.debug("skipping flush, no name for file is defined or this is a folder transform")
102
110
  return
103
111
  try:
104
112
  t_start = time.time()
@@ -141,15 +149,21 @@ class AbstractTransformFileProcessor:
141
149
  )
142
150
  case 1:
143
151
  # we have exactly 1 output file
144
- file_ext = out_files[0]
145
- lfn = self.last_file_name
146
- if self.last_file_name_next_index is not None:
147
- lfn = f"{lfn}_{self.last_file_name_next_index}"
148
- output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
152
+ if self.is_folder:
153
+ # it's a folder
154
+ output_name = out_files[0][1]
155
+ dt = out_files[0][0]
156
+ else:
157
+ file_ext = out_files[0]
158
+ lfn = self.last_file_name
159
+ if self.last_file_name_next_index is not None:
160
+ lfn = f"{lfn}_{self.last_file_name_next_index}"
161
+ output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
162
+ dt = file_ext[0]
149
163
  self.logger.debug(
150
164
  f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
151
165
  )
152
- save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0])
166
+ save_res, retries = self.data_access.save_file(path=output_name, data=dt)
153
167
  if retries > 0:
154
168
  self._publish_stats({"data access retries": retries})
155
169
  if save_res is None:
@@ -159,7 +173,7 @@ class AbstractTransformFileProcessor:
159
173
  self._publish_stats(
160
174
  {
161
175
  "result_files": 1,
162
- "result_size": len(file_ext[0]),
176
+ "result_size": len(dt),
163
177
  "processing_time": time.time() - t_start,
164
178
  }
165
179
  )
@@ -176,14 +190,21 @@ class AbstractTransformFileProcessor:
176
190
  start_index = 0
177
191
  count = len(out_files)
178
192
  for index in range(count):
179
- file_ext = out_files[index]
180
- output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
181
- file_sizes += len(file_ext[0])
182
- self.logger.debug(
183
- f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
184
- f"of {count} to {output_name_indexed}"
185
- )
186
- save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0])
193
+ if self.is_folder:
194
+ # it's a folder
195
+ output_name_indexed = out_files[index][1]
196
+ dt = out_files[index][0]
197
+ else:
198
+ # files
199
+ file_ext = out_files[index]
200
+ output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
201
+ self.logger.debug(
202
+ f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
203
+ f"of {count} to {output_name_indexed}"
204
+ )
205
+ dt = file_ext[0]
206
+ file_sizes += len(dt)
207
+ save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt)
187
208
  if retries > 0:
188
209
  self._publish_stats({"data access retries": retries})
189
210
  if save_res is None:
@@ -1,3 +1,15 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
1
13
  import os
2
14
  import sys
3
15
  from argparse import ArgumentParser
@@ -1,6 +1,11 @@
1
- from .table_transform_test import AbstractTableTransformTest
2
- from .binary_transform_test import AbstractBinaryTransformTest
3
- from .noop_transform import (
1
+ from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
2
+ from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
3
+ from data_processing.test_support.transform.noop_transform import (
4
4
  NOOPTransform,
5
- NOOPPythonTransformConfiguration,
5
+ NOOPTransformConfiguration,
6
+ NOOPPythonTransformConfiguration
6
7
  )
8
+ from data_processing.test_support.transform.noop_folder_transform import (
9
+ NOOPFolderTransform,
10
+ NOOPFolderPythonTransformConfiguration
11
+ )