data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
- data_processing/data_access/data_access.py +4 -1
- data_processing/data_access/data_access_local.py +0 -11
- data_processing/data_access/data_access_s3.py +0 -11
- data_processing/runtime/pure_python/transform_file_processor.py +9 -3
- data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
- data_processing/runtime/pure_python/transform_runtime.py +9 -1
- data_processing/runtime/transform_file_processor.py +53 -32
- data_processing/test_support/data_access/data_access_factory_test.py +12 -0
- data_processing/test_support/transform/__init__.py +9 -4
- data_processing/test_support/transform/noop_folder_transform.py +105 -0
- data_processing/test_support/transform/noop_transform.py +3 -3
- data_processing/transform/__init__.py +2 -0
- data_processing/transform/abstract_transform.py +16 -0
- data_processing/transform/binary_transform.py +3 -2
- data_processing/transform/folder_transform.py +40 -0
- data_processing/transform/transform_configuration.py +3 -3
- data_processing/utils/multilock.py +160 -0
- data_processing/utils/unrecoverable.py +13 -0
- data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
- data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
- data_processing_ray/runtime/ray/transform_runtime.py +9 -1
- data_processing_ray/test_support/transform/__init__.py +1 -0
- data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
- data_processing_ray/test_support/transform/noop_transform.py +1 -3
- data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
- data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
- data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
- data_processing_spark/runtime/spark/transform_runtime.py +24 -6
- data_processing_spark/test_support/transform/__init__.py +1 -0
- data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.2.2.dev1
|
|
3
|
+
Version: 0.2.2.dev2
|
|
4
4
|
Summary: Data Preparation Toolkit Library for Ray and Python
|
|
5
5
|
Author-email: Maroun Touma <touma@us.ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -30,4 +30,36 @@ Requires-Dist: pillow>=10.3.0; extra == "ray"
|
|
|
30
30
|
Provides-Extra: spark
|
|
31
31
|
Requires-Dist: pyspark>=3.5.2; extra == "spark"
|
|
32
32
|
Requires-Dist: psutil>=6.0.0; extra == "spark"
|
|
33
|
+
Requires-Dist: PyYAML>=6.0.2; extra == "spark"
|
|
34
|
+
|
|
35
|
+
# Data Processing Library
|
|
36
|
+
This provides a python framework for developing _transforms_
|
|
37
|
+
on data stored in files - currently parquet files are supported -
|
|
38
|
+
and running them in a [ray](https://www.ray.io/) cluster.
|
|
39
|
+
Data files may be stored in the local file system or COS/S3.
|
|
40
|
+
For more details see the [documentation](../doc/overview.md).
|
|
41
|
+
|
|
42
|
+
### Virtual Environment
|
|
43
|
+
The project uses `pyproject.toml` and a Makefile for operations.
|
|
44
|
+
To do development you should establish the virtual environment
|
|
45
|
+
```shell
|
|
46
|
+
make venv
|
|
47
|
+
```
|
|
48
|
+
and then either activate
|
|
49
|
+
```shell
|
|
50
|
+
source venv/bin/activate
|
|
51
|
+
```
|
|
52
|
+
or set up your IDE to use the venv directory when developing in this project
|
|
53
|
+
|
|
54
|
+
## Library Artifact Build and Publish
|
|
55
|
+
To test, build and publish the library
|
|
56
|
+
```shell
|
|
57
|
+
make test build publish
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
To up the version number, edit the Makefile to change VERSION and rerun
|
|
61
|
+
the above. This will require committing both the `Makefile` and the
|
|
62
|
+
automatically updated `pyproject.toml` file.
|
|
63
|
+
|
|
64
|
+
|
|
33
65
|
|
|
@@ -1,72 +1,78 @@
|
|
|
1
1
|
data_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
data_processing/data_access/__init__.py,sha256=996nZjaFiXZ-Zu648oC3eGlvZPTu-DTgsqvlKY4y4Ck,496
|
|
3
3
|
data_processing/data_access/arrow_s3.py,sha256=0wxYFrw1MSsntO0IbGUGaBuiB366cIAmpsWvycenU_4,9724
|
|
4
|
-
data_processing/data_access/data_access.py,sha256=
|
|
4
|
+
data_processing/data_access/data_access.py,sha256=hfERzfigBLgYoq5hsODcgsOMgh6lKISAGy_LP76Y2go,18813
|
|
5
5
|
data_processing/data_access/data_access_factory.py,sha256=Hdm06HBs_AIjmU4Aqyd_8klZEty57MbjueGlF-8basw,12339
|
|
6
6
|
data_processing/data_access/data_access_factory_base.py,sha256=dY_69ic4hKCQWSbRPNqxTRs8gcBhbW9rbvcAeeq0iFk,5685
|
|
7
|
-
data_processing/data_access/data_access_local.py,sha256=
|
|
8
|
-
data_processing/data_access/data_access_s3.py,sha256=
|
|
7
|
+
data_processing/data_access/data_access_local.py,sha256=5XozY_r_44ExjbFEPAzaIJDNEx3AKSk8h7zmf30Q2N0,8799
|
|
8
|
+
data_processing/data_access/data_access_s3.py,sha256=6cPkLY4-yOhtoCryDDXbEWq4SHJpHEh2_VTnBbsQgsk,7815
|
|
9
9
|
data_processing/data_access/snapshot_utils.py,sha256=5iT0sP5ZkXQpZmHQh_5Do2W846Xc04-6GPmociqxmAY,1207
|
|
10
10
|
data_processing/runtime/__init__.py,sha256=E9jeZ28v-TOGGB5O91GZ9GG9dP86bK02-o4qWPj8TNw,389
|
|
11
11
|
data_processing/runtime/execution_configuration.py,sha256=9EKln8O0bjKRbb7GysegxNQ6t9stI8GqbFIxscPYeak,3218
|
|
12
12
|
data_processing/runtime/runtime_configuration.py,sha256=uAKr-aawKcr_5Yq9fG8SktHI-Z1nCoVsu84UF2NiV9U,2769
|
|
13
|
-
data_processing/runtime/transform_file_processor.py,sha256
|
|
13
|
+
data_processing/runtime/transform_file_processor.py,sha256=SQ_ICPPXCLcBr_X2lmOLiddyjCfz4CCaFtvatu8tcSc,10898
|
|
14
14
|
data_processing/runtime/transform_launcher.py,sha256=34144dkMHSt58PQ1wbAKnELnhi9lubJcNCBopWoLRqo,4593
|
|
15
15
|
data_processing/runtime/pure_python/__init__.py,sha256=s0smp908uzeT1cd_6EERaI72WJqiOJ50SDC5yOgDBvc,744
|
|
16
16
|
data_processing/runtime/pure_python/execution_configuration.py,sha256=C0DCP3YR9sirFQ1Zqz4KcZ03WA5FLXl_kky9PBHREYQ,2617
|
|
17
17
|
data_processing/runtime/pure_python/runtime_configuration.py,sha256=a4vSY98HfRm2p6pIBNpT-wDEtqRX2RUbRpFYtXyq5Kk,1710
|
|
18
|
-
data_processing/runtime/pure_python/transform_file_processor.py,sha256=
|
|
18
|
+
data_processing/runtime/pure_python/transform_file_processor.py,sha256=jN-8OnPgctfY80W13WOnsdxCpDHi4HAgv0bM0Ysx-H4,4586
|
|
19
19
|
data_processing/runtime/pure_python/transform_invoker.py,sha256=lAG7tfyJyNqtwRB15-db4HJOQsBhT6JahLmjUFQFCRk,5192
|
|
20
20
|
data_processing/runtime/pure_python/transform_launcher.py,sha256=BDctJnYlR9OVzGCzMwg2cEuGdnV3E9fvhUgoyslvK8k,2447
|
|
21
|
-
data_processing/runtime/pure_python/transform_orchestrator.py,sha256=
|
|
22
|
-
data_processing/runtime/pure_python/transform_runtime.py,sha256=
|
|
21
|
+
data_processing/runtime/pure_python/transform_orchestrator.py,sha256=YtELV-ENqqklSGr1N2yqNaSsQZlR0o-lk16PNxDAHYE,10067
|
|
22
|
+
data_processing/runtime/pure_python/transform_runtime.py,sha256=3fz3_c_pYJkShXj_EuCGg9ieIb9ysfc_yhttsSjSDi4,2806
|
|
23
23
|
data_processing/test_support/__init__.py,sha256=O4lySih15vkOYUSa3uhTaoYw0RrV4rM_sUd691JEuVU,83
|
|
24
24
|
data_processing/test_support/abstract_test.py,sha256=gZ51wnWITEAyb8BzA2WFCM0quJBxQrlD7WBwUfIsWEA,12875
|
|
25
25
|
data_processing/test_support/data_access/__init__.py,sha256=q4xqedYF-a1Kkk64i1ToiEW_SZQDMw66mV1s-Owsqq0,69
|
|
26
|
-
data_processing/test_support/data_access/data_access_factory_test.py,sha256=
|
|
26
|
+
data_processing/test_support/data_access/data_access_factory_test.py,sha256=AE1EWibaNtu3svA4s94GFd0RzJtdTyCsBYKnPM0U2Ww,3328
|
|
27
27
|
data_processing/test_support/launch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
28
|
data_processing/test_support/launch/transform_test.py,sha256=vcqLeZZPSiJ0HPkr0qaBSLGhzKMaXeErtQdKu6fsiTs,5435
|
|
29
|
-
data_processing/test_support/transform/__init__.py,sha256=
|
|
29
|
+
data_processing/test_support/transform/__init__.py,sha256=zn8RWuDBFS1JNL1QCVGrGCBEMn6bcEqTaOsWp9XnGTg,502
|
|
30
30
|
data_processing/test_support/transform/binary_transform_test.py,sha256=ea3ipfxUuOYYxZaNSV-2ad2Q2Oqg32uenMAqxHSbDB4,4573
|
|
31
|
-
data_processing/test_support/transform/
|
|
31
|
+
data_processing/test_support/transform/noop_folder_transform.py,sha256=0V74ZVCb6DgOKlgtDCsYPDeYIg6xJm1vLnCiTlAjqeM,4410
|
|
32
|
+
data_processing/test_support/transform/noop_transform.py,sha256=9OnvZ5vhfveTjWSILnT8FyhvEL57yTIc_3wFi1R8YFw,5697
|
|
32
33
|
data_processing/test_support/transform/table_transform_test.py,sha256=nTlomV3-X9kkiWaR7AF_qFp5T2fWWSoDCEi8p5SpqtI,4619
|
|
33
|
-
data_processing/transform/__init__.py,sha256=
|
|
34
|
-
data_processing/transform/
|
|
34
|
+
data_processing/transform/__init__.py,sha256=RqHj9FCbkKsYqeZRdFLiFapnJkWaj0AhPAdB0-Fy_BE,496
|
|
35
|
+
data_processing/transform/abstract_transform.py,sha256=nSPaJiu-M1Ptzmixw3KgSlKXJyeFp7J8PIGkt9ULiog,738
|
|
36
|
+
data_processing/transform/binary_transform.py,sha256=oM_v5_HsgM914FamzWP_IgpXxp4cxSGzer9io-G2c8k,2982
|
|
37
|
+
data_processing/transform/folder_transform.py,sha256=XtdSHlvs4qbVTfXAWIVbpPtSnQjMQOYvJq8WZHeSdtU,1910
|
|
35
38
|
data_processing/transform/table_transform.py,sha256=BliQKz7ShFTpXRe9WBYAeRM7n-AhEWYj5kb05QAxS2c,6281
|
|
36
|
-
data_processing/transform/transform_configuration.py,sha256=
|
|
39
|
+
data_processing/transform/transform_configuration.py,sha256=4hLPy78eMpFw0yKQy2ICZcFicUz--kvgUFAXorUkkwE,4395
|
|
37
40
|
data_processing/transform/transform_statistics.py,sha256=eeInqfFFxhmQoaMJXXaOq_zY49mmQYAA_9xmdwi5vDw,1437
|
|
38
41
|
data_processing/utils/__init__.py,sha256=lL25FenOmye6EBtcQLbczJJ3NtrGlmW9v0hDQfKrCno,584
|
|
39
42
|
data_processing/utils/cli_utils.py,sha256=vjs6qOxxYetEYCYx_8jX_KBVwA7z_iBYFPhyPyucCGM,3135
|
|
40
43
|
data_processing/utils/config.py,sha256=GZgmmvjiSEL86Vk_wx7lJ-Y27ItQXujthb5mcdV4FMA,1695
|
|
41
44
|
data_processing/utils/log.py,sha256=t7S_ITjWJicEaCPcSk3cKKXyzEe1u21Ov5aQz3CzrZQ,2226
|
|
45
|
+
data_processing/utils/multilock.py,sha256=ra5Qy61zqmdPNhAqYNBXLG6L6LjTxTyTpDLMOuc99x0,5734
|
|
42
46
|
data_processing/utils/params_utils.py,sha256=oAKY3wC8b17rDUJGqX19-rAQHDc9SQn1ksTeo3RFgCY,6668
|
|
43
47
|
data_processing/utils/pipinstaller.py,sha256=PxFNwEy8v4FqjwYgrPhH0UTrCgsJvM5WAE2fKylsk2Q,2511
|
|
44
48
|
data_processing/utils/transform_configuration.json,sha256=6YBw0Hk2mokY6JBn1kR6L9AkV_yivbFrpSoHecAJp9o,4562
|
|
45
49
|
data_processing/utils/transform_configurator.py,sha256=9OHSCQ8rFSoDdMW6ZCHYdNe6thRwV9zOaRPnLkWNMYE,3601
|
|
46
50
|
data_processing/utils/transform_utils.py,sha256=KGNioN35B1i1h-MIsfm3QvXLlU1aGXimheva7NbUhMM,8496
|
|
47
|
-
data_processing/utils/unrecoverable.py,sha256=
|
|
51
|
+
data_processing/utils/unrecoverable.py,sha256=cbF74AGK1IdRor_L1w_hPwglV_b2blP6Ad4ET79xrl0,831
|
|
48
52
|
data_processing_ray/runtime/ray/__init__.py,sha256=vjQOvb_OJNq3c1F_tG3WjO-pciY77Z1lETO2Ha_GVbw,784
|
|
49
53
|
data_processing_ray/runtime/ray/execution_configuration.py,sha256=C9YFixlATr7PPpkVQ0WzjCCPTWFuP80W2rnzY1bbp5I,4628
|
|
50
54
|
data_processing_ray/runtime/ray/ray_utils.py,sha256=eDPm-pybPOELjKkvoz3l-qFU-k1Iwh-giGlXULiZjEk,10212
|
|
51
55
|
data_processing_ray/runtime/ray/runtime_configuration.py,sha256=js9dXwdxjYbSigMC49F07XmbLjmj9HiipPE6BDaIGfA,1691
|
|
52
|
-
data_processing_ray/runtime/ray/transform_file_processor.py,sha256=
|
|
56
|
+
data_processing_ray/runtime/ray/transform_file_processor.py,sha256=vciH2OViK_RLSzze34kaLkRNi5Bg2JmH-KQAYx5zoo0,2388
|
|
53
57
|
data_processing_ray/runtime/ray/transform_invoker.py,sha256=apfH8uilpm9sJ4IpHgiNdIzcH_IHGQba5fui4cCfolk,4026
|
|
54
58
|
data_processing_ray/runtime/ray/transform_launcher.py,sha256=oxI3MFZI_-LzTwHbrHBIUqJ0htnliKBuALt86qijRwU,4304
|
|
55
|
-
data_processing_ray/runtime/ray/transform_orchestrator.py,sha256=
|
|
56
|
-
data_processing_ray/runtime/ray/transform_runtime.py,sha256=
|
|
59
|
+
data_processing_ray/runtime/ray/transform_orchestrator.py,sha256=ZY4Wu7luS3kiUWAsTPZAJg4upyNRwGl2Ez0K832vAIw,7552
|
|
60
|
+
data_processing_ray/runtime/ray/transform_runtime.py,sha256=s-9lrPBg-ifRRYJPJfbL8P9cOpYdvQKcairxpaIbB4E,2779
|
|
57
61
|
data_processing_ray/runtime/ray/transform_statistics.py,sha256=cxrSQVnzRBCGS68IoiVGLoRBWBxPBSFFMDiT29FNt0g,3749
|
|
58
|
-
data_processing_ray/test_support/transform/__init__.py,sha256=
|
|
59
|
-
data_processing_ray/test_support/transform/
|
|
62
|
+
data_processing_ray/test_support/transform/__init__.py,sha256=P9stuAcnpjeGrJ8HJKRTuT2BKwLtG_3uG1UCYpnkw8c,213
|
|
63
|
+
data_processing_ray/test_support/transform/noop_folder_transform.py,sha256=Sw5heLb4EyPYpTjoCW1K53FOt7BH1Zob2bLZUQbwmfs,2075
|
|
64
|
+
data_processing_ray/test_support/transform/noop_transform.py,sha256=97o8oizjhBR_dqdrXhwss5QvW8-FZ_QYfTQGXpyUpGk,1564
|
|
60
65
|
data_processing_spark/runtime/spark/__init__.py,sha256=bhY1xI9lL0GR2v1APahlhC5sh5rdVcGhQbWN4yoXApw,1233
|
|
61
66
|
data_processing_spark/runtime/spark/execution_configuration.py,sha256=BqxUlpXFdHRK-csO2jaJJtktyKbcMtjIn3sjAPBfO58,3643
|
|
62
|
-
data_processing_spark/runtime/spark/runtime_configuration.py,sha256=
|
|
63
|
-
data_processing_spark/runtime/spark/transform_file_processor.py,sha256=
|
|
67
|
+
data_processing_spark/runtime/spark/runtime_configuration.py,sha256=lAx7aGLq0_-ySJSc_fx5ek1bIgLcGUgfmAM-HcVLkqc,2377
|
|
68
|
+
data_processing_spark/runtime/spark/transform_file_processor.py,sha256=E0JEoEvAqdygYBHkvlm5kdkNbPO4aPQljfk833_Fqv8,2682
|
|
64
69
|
data_processing_spark/runtime/spark/transform_launcher.py,sha256=1PZ-N4Wy2Qqiqr2z9S1xV88cNsAoHrmmuPadiOakJLM,2479
|
|
65
|
-
data_processing_spark/runtime/spark/transform_orchestrator.py,sha256=
|
|
66
|
-
data_processing_spark/runtime/spark/transform_runtime.py,sha256=
|
|
67
|
-
data_processing_spark/test_support/transform/__init__.py,sha256=
|
|
70
|
+
data_processing_spark/runtime/spark/transform_orchestrator.py,sha256=64lme-KZz-a1CQUThYnLtx8m99VMlYv8MJB7dHCIAo4,9470
|
|
71
|
+
data_processing_spark/runtime/spark/transform_runtime.py,sha256=je27rTRdd-5Wtd8nc8ogUwxZqUd4ZgP5122tA8JtTKA,3258
|
|
72
|
+
data_processing_spark/test_support/transform/__init__.py,sha256=FQJyj7z1hXQynngMVQlCTJxTh2bdc4jN4220CBmLTqE,872
|
|
73
|
+
data_processing_spark/test_support/transform/noop_folder_transform.py,sha256=z0jXCVKJYHPqB9ZTfUxnQkUVDnmfWjvss4_I3QZ8JZ4,2187
|
|
68
74
|
data_processing_spark/test_support/transform/noop_transform.py,sha256=0FR3o-LnXf-UFS5gU0j-i4LVlw1mHDxGaPI40dkkIKY,1694
|
|
69
|
-
data_prep_toolkit-0.2.2.
|
|
70
|
-
data_prep_toolkit-0.2.2.
|
|
71
|
-
data_prep_toolkit-0.2.2.
|
|
72
|
-
data_prep_toolkit-0.2.2.
|
|
75
|
+
data_prep_toolkit-0.2.2.dev2.dist-info/METADATA,sha256=XgskYjPA5pddqDgaBrPpe1IeqOpHPB2WscNM4dRh7XQ,2240
|
|
76
|
+
data_prep_toolkit-0.2.2.dev2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
77
|
+
data_prep_toolkit-0.2.2.dev2.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
|
|
78
|
+
data_prep_toolkit-0.2.2.dev2.dist-info/RECORD,,
|
|
@@ -358,7 +358,10 @@ class DataAccess:
|
|
|
358
358
|
:param path: input file location
|
|
359
359
|
:return: output file location
|
|
360
360
|
"""
|
|
361
|
-
|
|
361
|
+
if self.get_output_folder() is None:
|
|
362
|
+
self.logger.error("Get out put location. S3 configuration is not provided, returning None")
|
|
363
|
+
return None
|
|
364
|
+
return path.replace(self.get_input_folder(), self.get_output_folder())
|
|
362
365
|
|
|
363
366
|
def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
|
|
364
367
|
"""
|
|
@@ -130,17 +130,6 @@ class DataAccessLocal(DataAccess):
|
|
|
130
130
|
logger.error(f"Error reading table from {path}: {e}")
|
|
131
131
|
return None, 0
|
|
132
132
|
|
|
133
|
-
def get_output_location(self, path: str) -> str:
|
|
134
|
-
"""
|
|
135
|
-
Get output location based on input
|
|
136
|
-
:param path: input file location
|
|
137
|
-
:return: output file location
|
|
138
|
-
"""
|
|
139
|
-
if self.output_folder is None:
|
|
140
|
-
logger.error("Get output location. local configuration is not defined, returning None")
|
|
141
|
-
return None
|
|
142
|
-
return path.replace(self.input_folder, self.output_folder)
|
|
143
|
-
|
|
144
133
|
def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
|
|
145
134
|
"""
|
|
146
135
|
Saves a pyarrow table to a file and returns information about the operation.
|
|
@@ -126,17 +126,6 @@ class DataAccessS3(DataAccess):
|
|
|
126
126
|
self.logger.error(f"Exception reading table {path} from S3 - {e}")
|
|
127
127
|
return None, 0
|
|
128
128
|
|
|
129
|
-
def get_output_location(self, path: str) -> str:
|
|
130
|
-
"""
|
|
131
|
-
Get output location based on input
|
|
132
|
-
:param path: input file location
|
|
133
|
-
:return: output file location
|
|
134
|
-
"""
|
|
135
|
-
if self.output_folder is None:
|
|
136
|
-
self.logger.error("Get out put location. S3 configuration is not provided, returning None")
|
|
137
|
-
return None
|
|
138
|
-
return path.replace(self.input_folder, self.output_folder)
|
|
139
|
-
|
|
140
129
|
def save_table(self, path: str, table: pyarrow.Table) -> tuple[int, dict[str, Any], int]:
|
|
141
130
|
"""
|
|
142
131
|
Save table to a given location
|
|
@@ -14,7 +14,7 @@ from typing import Any
|
|
|
14
14
|
|
|
15
15
|
from data_processing.data_access import DataAccessFactoryBase
|
|
16
16
|
from data_processing.runtime import AbstractTransformFileProcessor
|
|
17
|
-
from data_processing.transform import
|
|
17
|
+
from data_processing.transform import AbstractTransform, TransformStatistics
|
|
18
18
|
from data_processing.utils import UnrecoverableException
|
|
19
19
|
|
|
20
20
|
|
|
@@ -28,7 +28,8 @@ class PythonTransformFileProcessor(AbstractTransformFileProcessor):
|
|
|
28
28
|
data_access_factory: DataAccessFactoryBase,
|
|
29
29
|
statistics: TransformStatistics,
|
|
30
30
|
transform_params: dict[str, Any],
|
|
31
|
-
transform_class: type[
|
|
31
|
+
transform_class: type[AbstractTransform],
|
|
32
|
+
is_folder: bool,
|
|
32
33
|
):
|
|
33
34
|
"""
|
|
34
35
|
Init method
|
|
@@ -36,11 +37,13 @@ class PythonTransformFileProcessor(AbstractTransformFileProcessor):
|
|
|
36
37
|
:param statistics - reference to statistics class
|
|
37
38
|
:param transform_params - transform parameters
|
|
38
39
|
:param transform_class: transform class
|
|
40
|
+
:param is_folder: folder transform flag
|
|
39
41
|
"""
|
|
40
42
|
# invoke superclass
|
|
41
43
|
super().__init__(
|
|
42
44
|
data_access_factory=data_access_factory,
|
|
43
45
|
transform_parameters=dict(transform_params),
|
|
46
|
+
is_folder=is_folder,
|
|
44
47
|
)
|
|
45
48
|
self.transform_params["statistics"] = statistics
|
|
46
49
|
# Create local processor
|
|
@@ -65,17 +68,20 @@ class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
|
|
|
65
68
|
self,
|
|
66
69
|
data_access_factory: DataAccessFactoryBase,
|
|
67
70
|
transform_params: dict[str, Any],
|
|
68
|
-
transform_class: type[
|
|
71
|
+
transform_class: type[AbstractTransform],
|
|
72
|
+
is_folder: bool
|
|
69
73
|
):
|
|
70
74
|
"""
|
|
71
75
|
Init method
|
|
72
76
|
:param data_access_factory - data access factory
|
|
73
77
|
:param transform_params - transform parameters
|
|
74
78
|
:param transform_class: transform class
|
|
79
|
+
:param is_folder: folder transform flag
|
|
75
80
|
"""
|
|
76
81
|
super().__init__(
|
|
77
82
|
data_access_factory=data_access_factory,
|
|
78
83
|
transform_parameters=dict(transform_params),
|
|
84
|
+
is_folder=is_folder,
|
|
79
85
|
)
|
|
80
86
|
# Add data access and statistics to the processor parameters
|
|
81
87
|
self.transform_params["data_access"] = self.data_access
|
|
@@ -24,14 +24,13 @@ from data_processing.runtime.pure_python import (
|
|
|
24
24
|
PythonTransformFileProcessor,
|
|
25
25
|
PythonTransformRuntimeConfiguration,
|
|
26
26
|
)
|
|
27
|
-
from data_processing.transform import
|
|
27
|
+
from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform
|
|
28
28
|
from data_processing.utils import GB, get_logger
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
logger = get_logger(__name__)
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
@staticmethod
|
|
35
34
|
def _execution_resources() -> dict[str, Any]:
|
|
36
35
|
"""
|
|
37
36
|
Get Execution resource
|
|
@@ -49,7 +48,6 @@ def _execution_resources() -> dict[str, Any]:
|
|
|
49
48
|
}
|
|
50
49
|
|
|
51
50
|
|
|
52
|
-
|
|
53
51
|
def orchestrate(
|
|
54
52
|
data_access_factory: DataAccessFactoryBase,
|
|
55
53
|
runtime_config: PythonTransformRuntimeConfiguration,
|
|
@@ -74,15 +72,21 @@ def orchestrate(
|
|
|
74
72
|
return 1
|
|
75
73
|
# create additional execution parameters
|
|
76
74
|
runtime = runtime_config.create_transform_runtime()
|
|
75
|
+
is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
|
|
77
76
|
try:
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
logger.
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
77
|
+
if is_folder:
|
|
78
|
+
# folder transform
|
|
79
|
+
files = runtime.get_folders(data_access=data_access)
|
|
80
|
+
logger.info(f"Number of folders is {len(files)}")
|
|
81
|
+
else:
|
|
82
|
+
# Get files to process
|
|
83
|
+
files, profile, retries = data_access.get_files_to_process()
|
|
84
|
+
if len(files) == 0:
|
|
85
|
+
logger.error("No input files to process - exiting")
|
|
86
|
+
return 0
|
|
87
|
+
if retries > 0:
|
|
88
|
+
statistics.add_stats({"data access retries": retries})
|
|
89
|
+
logger.info(f"Number of files is {len(files)}, source profile {profile}")
|
|
86
90
|
# Print interval
|
|
87
91
|
print_interval = int(len(files) / 100)
|
|
88
92
|
if print_interval == 0:
|
|
@@ -99,6 +103,7 @@ def orchestrate(
|
|
|
99
103
|
data_access_factory=data_access_factory, statistics=statistics, files=files
|
|
100
104
|
),
|
|
101
105
|
transform_class=runtime_config.get_transform_class(),
|
|
106
|
+
is_folder=is_folder,
|
|
102
107
|
)
|
|
103
108
|
else:
|
|
104
109
|
# using sequential execution
|
|
@@ -111,6 +116,7 @@ def orchestrate(
|
|
|
111
116
|
data_access_factory=data_access_factory, statistics=statistics, files=files
|
|
112
117
|
),
|
|
113
118
|
transform_class=runtime_config.get_transform_class(),
|
|
119
|
+
is_folder=is_folder,
|
|
114
120
|
)
|
|
115
121
|
status = "success"
|
|
116
122
|
return_code = 0
|
|
@@ -139,7 +145,8 @@ def orchestrate(
|
|
|
139
145
|
"job_input_params": input_params
|
|
140
146
|
| data_access_factory.get_input_params()
|
|
141
147
|
| execution_config.get_input_params(),
|
|
142
|
-
"execution_stats": _execution_resources() |
|
|
148
|
+
"execution_stats": _execution_resources() |
|
|
149
|
+
{"execution time, min": round((time.time() - start_time) / 60.0, 3)},
|
|
143
150
|
"job_output_stats": stats,
|
|
144
151
|
}
|
|
145
152
|
logger.debug(f"Saving job metadata: {metadata}.")
|
|
@@ -157,7 +164,8 @@ def _process_transforms(
|
|
|
157
164
|
data_access_factory: DataAccessFactoryBase,
|
|
158
165
|
statistics: TransformStatistics,
|
|
159
166
|
transform_params: dict[str, Any],
|
|
160
|
-
transform_class: type[
|
|
167
|
+
transform_class: type[AbstractTransform],
|
|
168
|
+
is_folder: bool,
|
|
161
169
|
) -> None:
|
|
162
170
|
"""
|
|
163
171
|
Process transforms sequentially
|
|
@@ -167,9 +175,8 @@ def _process_transforms(
|
|
|
167
175
|
:param data_access_factory: data access factory
|
|
168
176
|
:param transform_params - transform parameters
|
|
169
177
|
:param transform_class: transform class
|
|
178
|
+
:param is_folder: folder transform flag
|
|
170
179
|
:return: metadata for the execution
|
|
171
|
-
|
|
172
|
-
:return: None
|
|
173
180
|
"""
|
|
174
181
|
# create executor
|
|
175
182
|
executor = PythonTransformFileProcessor(
|
|
@@ -177,6 +184,7 @@ def _process_transforms(
|
|
|
177
184
|
statistics=statistics,
|
|
178
185
|
transform_params=transform_params,
|
|
179
186
|
transform_class=transform_class,
|
|
187
|
+
is_folder=is_folder,
|
|
180
188
|
)
|
|
181
189
|
# process data
|
|
182
190
|
t_start = time.time()
|
|
@@ -202,7 +210,8 @@ def _process_transforms_multiprocessor(
|
|
|
202
210
|
print_interval: int,
|
|
203
211
|
data_access_factory: DataAccessFactoryBase,
|
|
204
212
|
transform_params: dict[str, Any],
|
|
205
|
-
transform_class: type[
|
|
213
|
+
transform_class: type[AbstractTransform],
|
|
214
|
+
is_folder: bool
|
|
206
215
|
) -> TransformStatistics:
|
|
207
216
|
"""
|
|
208
217
|
Process transforms using multiprocessing pool
|
|
@@ -212,13 +221,17 @@ def _process_transforms_multiprocessor(
|
|
|
212
221
|
:param data_access_factory: data access factory
|
|
213
222
|
:param transform_params - transform parameters
|
|
214
223
|
:param transform_class: transform class
|
|
224
|
+
:param is_folder: folder transform flag
|
|
215
225
|
:return: metadata for the execution
|
|
216
226
|
"""
|
|
217
227
|
# result statistics
|
|
218
228
|
statistics = TransformStatistics()
|
|
219
229
|
# create processor
|
|
220
230
|
processor = PythonPoolTransformFileProcessor(
|
|
221
|
-
data_access_factory=data_access_factory,
|
|
231
|
+
data_access_factory=data_access_factory,
|
|
232
|
+
transform_params=transform_params,
|
|
233
|
+
transform_class=transform_class,
|
|
234
|
+
is_folder=is_folder,
|
|
222
235
|
)
|
|
223
236
|
completed = 0
|
|
224
237
|
t_start = time.time()
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
|
|
13
13
|
from typing import Any
|
|
14
14
|
|
|
15
|
-
from data_processing.data_access import DataAccessFactoryBase
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase, DataAccess
|
|
16
16
|
from data_processing.transform import TransformStatistics
|
|
17
17
|
|
|
18
18
|
|
|
@@ -28,6 +28,14 @@ class DefaultPythonTransformRuntime:
|
|
|
28
28
|
"""
|
|
29
29
|
self.params = params
|
|
30
30
|
|
|
31
|
+
def get_folders(self, data_access: DataAccess) -> list[str]:
|
|
32
|
+
"""
|
|
33
|
+
Get folders to process
|
|
34
|
+
:param data_access: data access
|
|
35
|
+
:return: list of folders to process
|
|
36
|
+
"""
|
|
37
|
+
raise NotImplemented()
|
|
38
|
+
|
|
31
39
|
def get_transform_config(
|
|
32
40
|
self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
|
|
33
41
|
) -> dict[str, Any]:
|
|
@@ -26,11 +26,13 @@ class AbstractTransformFileProcessor:
|
|
|
26
26
|
self,
|
|
27
27
|
data_access_factory: DataAccessFactoryBase,
|
|
28
28
|
transform_parameters: dict[str, Any],
|
|
29
|
+
is_folder: bool = False,
|
|
29
30
|
):
|
|
30
31
|
"""
|
|
31
32
|
Init method
|
|
32
33
|
:param data_access_factory: Data Access Factory
|
|
33
34
|
:param transform_parameters: Transform parameters
|
|
35
|
+
:param is_folder: folder transform flag
|
|
34
36
|
"""
|
|
35
37
|
self.logger = get_logger(__name__)
|
|
36
38
|
# validate parameters
|
|
@@ -46,6 +48,7 @@ class AbstractTransformFileProcessor:
|
|
|
46
48
|
# Add data access and statistics to the processor parameters
|
|
47
49
|
self.transform_params = transform_parameters
|
|
48
50
|
self.transform_params["data_access"] = self.data_access
|
|
51
|
+
self.is_folder = is_folder
|
|
49
52
|
|
|
50
53
|
def process_file(self, f_name: str) -> None:
|
|
51
54
|
"""
|
|
@@ -58,25 +61,30 @@ class AbstractTransformFileProcessor:
|
|
|
58
61
|
self.logger.warning("No data_access found. Returning.")
|
|
59
62
|
return
|
|
60
63
|
t_start = time.time()
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
64
|
+
if not self.is_folder:
|
|
65
|
+
# Read source file only if we are processing file
|
|
66
|
+
filedata, retries = self.data_access.get_file(path=f_name)
|
|
67
|
+
if retries > 0:
|
|
68
|
+
self._publish_stats({"data access retries": retries})
|
|
69
|
+
if filedata is None:
|
|
70
|
+
self.logger.warning(f"File read resulted in None for {f_name}. Returning.")
|
|
71
|
+
self._publish_stats({"failed_reads": 1})
|
|
72
|
+
return
|
|
73
|
+
self._publish_stats({"source_files": 1, "source_size": len(filedata)})
|
|
70
74
|
# Process input file
|
|
71
75
|
try:
|
|
72
|
-
# execute local processing
|
|
73
|
-
name_extension = TransformUtils.get_file_extension(f_name)
|
|
74
76
|
self.logger.debug(f"Begin transforming file {f_name}")
|
|
75
|
-
|
|
77
|
+
if not self.is_folder:
|
|
78
|
+
# execute local processing
|
|
79
|
+
out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata)
|
|
80
|
+
name_extension = TransformUtils.get_file_extension(f_name)
|
|
81
|
+
self.last_file_name = name_extension[0]
|
|
82
|
+
self.last_file_name_next_index = None
|
|
83
|
+
self.last_extension = name_extension[1]
|
|
84
|
+
else:
|
|
85
|
+
out_files, stats = self.transform.transform(folder_name=f_name)
|
|
86
|
+
self.last_file_name = f_name
|
|
76
87
|
self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files")
|
|
77
|
-
self.last_file_name = name_extension[0]
|
|
78
|
-
self.last_file_name_next_index = None
|
|
79
|
-
self.last_extension = name_extension[1]
|
|
80
88
|
# save results
|
|
81
89
|
self._submit_file(t_start=t_start, out_files=out_files, stats=stats)
|
|
82
90
|
# Process unrecoverable exceptions
|
|
@@ -95,10 +103,10 @@ class AbstractTransformFileProcessor:
|
|
|
95
103
|
the hook for them to return back locally stored data and their statistics.
|
|
96
104
|
:return: None
|
|
97
105
|
"""
|
|
98
|
-
if self.last_file_name is None:
|
|
106
|
+
if self.last_file_name is None or self.is_folder:
|
|
99
107
|
# for some reason a given worker never processed anything. Happens in testing
|
|
100
108
|
# when the amount of workers is greater than the amount of files
|
|
101
|
-
self.logger.debug("skipping flush, no name for file is defined")
|
|
109
|
+
self.logger.debug("skipping flush, no name for file is defined or this is a folder transform")
|
|
102
110
|
return
|
|
103
111
|
try:
|
|
104
112
|
t_start = time.time()
|
|
@@ -141,15 +149,21 @@ class AbstractTransformFileProcessor:
|
|
|
141
149
|
)
|
|
142
150
|
case 1:
|
|
143
151
|
# we have exactly 1 output file
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
152
|
+
if self.is_folder:
|
|
153
|
+
# it's a folder
|
|
154
|
+
output_name = out_files[0][1]
|
|
155
|
+
dt = out_files[0][0]
|
|
156
|
+
else:
|
|
157
|
+
file_ext = out_files[0]
|
|
158
|
+
lfn = self.last_file_name
|
|
159
|
+
if self.last_file_name_next_index is not None:
|
|
160
|
+
lfn = f"{lfn}_{self.last_file_name_next_index}"
|
|
161
|
+
output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
|
|
162
|
+
dt = file_ext[0]
|
|
149
163
|
self.logger.debug(
|
|
150
164
|
f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
|
|
151
165
|
)
|
|
152
|
-
save_res, retries = self.data_access.save_file(path=output_name, data=
|
|
166
|
+
save_res, retries = self.data_access.save_file(path=output_name, data=dt)
|
|
153
167
|
if retries > 0:
|
|
154
168
|
self._publish_stats({"data access retries": retries})
|
|
155
169
|
if save_res is None:
|
|
@@ -159,7 +173,7 @@ class AbstractTransformFileProcessor:
|
|
|
159
173
|
self._publish_stats(
|
|
160
174
|
{
|
|
161
175
|
"result_files": 1,
|
|
162
|
-
"result_size": len(
|
|
176
|
+
"result_size": len(dt),
|
|
163
177
|
"processing_time": time.time() - t_start,
|
|
164
178
|
}
|
|
165
179
|
)
|
|
@@ -176,14 +190,21 @@ class AbstractTransformFileProcessor:
|
|
|
176
190
|
start_index = 0
|
|
177
191
|
count = len(out_files)
|
|
178
192
|
for index in range(count):
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
193
|
+
if self.is_folder:
|
|
194
|
+
# it's a folder
|
|
195
|
+
output_name_indexed = out_files[index][1]
|
|
196
|
+
dt = out_files[index][0]
|
|
197
|
+
else:
|
|
198
|
+
# files
|
|
199
|
+
file_ext = out_files[index]
|
|
200
|
+
output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
|
|
201
|
+
self.logger.debug(
|
|
202
|
+
f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
|
|
203
|
+
f"of {count} to {output_name_indexed}"
|
|
204
|
+
)
|
|
205
|
+
dt = file_ext[0]
|
|
206
|
+
file_sizes += len(dt)
|
|
207
|
+
save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt)
|
|
187
208
|
if retries > 0:
|
|
188
209
|
self._publish_stats({"data access retries": retries})
|
|
189
210
|
if save_res is None:
|
|
@@ -1,3 +1,15 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
1
13
|
import os
|
|
2
14
|
import sys
|
|
3
15
|
from argparse import ArgumentParser
|
|
@@ -1,6 +1,11 @@
|
|
|
1
|
-
from .table_transform_test import AbstractTableTransformTest
|
|
2
|
-
from .binary_transform_test import AbstractBinaryTransformTest
|
|
3
|
-
from .noop_transform import (
|
|
1
|
+
from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
|
|
2
|
+
from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
|
|
3
|
+
from data_processing.test_support.transform.noop_transform import (
|
|
4
4
|
NOOPTransform,
|
|
5
|
-
|
|
5
|
+
NOOPTransformConfiguration,
|
|
6
|
+
NOOPPythonTransformConfiguration
|
|
6
7
|
)
|
|
8
|
+
from data_processing.test_support.transform.noop_folder_transform import (
|
|
9
|
+
NOOPFolderTransform,
|
|
10
|
+
NOOPFolderPythonTransformConfiguration
|
|
11
|
+
)
|