easylink 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +18 -9
- easylink/graph_components.py +12 -2
- easylink/implementation.py +2 -0
- easylink/pipeline.py +92 -34
- easylink/pipeline_graph.py +112 -27
- easylink/pipeline_schema_constants/__init__.py +3 -0
- easylink/pipeline_schema_constants/development.py +11 -2
- easylink/pipeline_schema_constants/testing.py +135 -0
- easylink/rule.py +282 -22
- easylink/runner.py +1 -0
- easylink/step.py +65 -0
- easylink/utilities/aggregator_utils.py +31 -0
- easylink/utilities/data_utils.py +1 -0
- easylink/utilities/general_utils.py +1 -0
- easylink/utilities/splitter_utils.py +71 -0
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/METADATA +1 -1
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/RECORD +21 -19
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/WHEEL +1 -1
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
"""
|
2
|
+
========================
|
3
|
+
Data Splitting Utilities
|
4
|
+
========================
|
5
|
+
|
6
|
+
This module contains utility functions for splitting datasets into smaller datasets.
|
7
|
+
One primary use case for this is to run sections of the pipeline in an embarrassingly
|
8
|
+
parallel manner.
|
9
|
+
|
10
|
+
Note that it is critical that all data splitting utility functions are defined
|
11
|
+
in this module; easylink will not be able to find them otherwise.
|
12
|
+
"""
|
13
|
+
|
14
|
+
import math
|
15
|
+
import os
|
16
|
+
|
17
|
+
import pandas as pd
|
18
|
+
from loguru import logger
|
19
|
+
|
20
|
+
|
21
|
+
def split_data_by_size(
    input_files: list[str], output_dir: str, desired_chunk_size_mb: int | float
) -> None:
    """Splits the data (from a single input slot) into chunks of desired size.

    This function takes all datasets from a single input slot, concatenates them,
    and then splits the resulting dataset into chunks of the desired size. The
    number of chunks is derived from the combined on-disk size of the input files;
    rows are then divided as evenly as possible between chunks. The final chunk
    may be smaller than the others if the rows do not divide evenly; no effort is
    made to redistribute the lingering data. Chunks that would contain no rows are
    never written, except that a dataset with zero rows still produces a single
    (empty) ``chunk_0`` so that an output file always exists.

    Each chunk is written to ``<output_dir>/chunk_<i>/result.parquet``.

    Parameters
    ----------
    input_files
        A list of input file paths to be concatenated and split.
    output_dir
        The directory where the resulting chunks will be saved.
    desired_chunk_size_mb
        The desired size of each chunk, in megabytes.

    Raises
    ------
    ValueError
        If ``desired_chunk_size_mb`` is not a positive number.
    """
    if desired_chunk_size_mb <= 0:
        raise ValueError(
            f"desired_chunk_size_mb must be positive; got {desired_chunk_size_mb}"
        )

    # Read every input file once and concatenate in a single call; growing a
    # DataFrame inside the loop re-copies all previously read rows each time.
    input_file_size_mb = sum(os.path.getsize(file) for file in input_files) / 1024**2
    frames = [pd.read_parquet(file) for file in input_files]
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    num_rows = len(df)

    # The on-disk size decides how many chunks to aim for (always at least one).
    target_chunks = max(1, math.ceil(input_file_size_mb / desired_chunk_size_mb))
    chunk_size = max(1, math.ceil(num_rows / target_chunks))
    # Rounding chunk_size up can leave trailing chunks with no rows (e.g. 9 rows
    # over 4 target chunks gives chunk_size 3, i.e. only 3 non-empty chunks), so
    # recompute the number of chunks that will actually hold data.
    num_chunks = max(1, math.ceil(num_rows / chunk_size))

    if num_chunks == 1:
        logger.info("Input data is already smaller than desired chunk size; not splitting")
    else:
        logger.info(
            f"Splitting a {round(input_file_size_mb, 2)} MB dataset ({num_rows} rows) "
            f"into {num_chunks} chunks of size ~{desired_chunk_size_mb} MB each"
        )

    for i in range(num_chunks):
        start = i * chunk_size
        stop = min(start + chunk_size, num_rows)
        chunk = df.iloc[start:stop]
        chunk_dir = os.path.join(output_dir, f"chunk_{i}")
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(chunk_dir, exist_ok=True)
        # The concatenated frame has a fresh 0..n-1 index, so positions and index
        # labels coincide; describing the range by position is also safe for an
        # empty chunk, where indexing into chunk.index would raise.
        row_range = f"rows {start} to {stop - 1}" if stop > start else "no rows"
        logger.debug(f"Writing out chunk {i+1}/{num_chunks} ({row_range})")
        chunk.to_parquet(os.path.join(chunk_dir, "result.parquet"))
|
@@ -1,22 +1,22 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
4
|
-
easylink/cli.py,sha256=
|
3
|
+
easylink/_version.py,sha256=YpKDcdV7CqL8n45u267wKtyloM13FSVbOdrqgNZnSLM,22
|
4
|
+
easylink/cli.py,sha256=ARSKAljepNOEYd1VCS_QqBJQIBLzE3IgKiOb5-OROdY,6380
|
5
5
|
easylink/configuration.py,sha256=Ire2pMZNZ6wtSwhcWnQpYa-snX4KrhXgovlQwQ2Wxf4,12530
|
6
|
-
easylink/graph_components.py,sha256=
|
7
|
-
easylink/implementation.py,sha256=
|
6
|
+
easylink/graph_components.py,sha256=U6gbKjQVTBftdOGlH-oOKS6o0dyS88IL6MCpI_mhv3s,12354
|
7
|
+
easylink/implementation.py,sha256=AwGl5YCKCSQo91owWj-gg9_5lBz7H_4q2z7jF0BhXs4,8992
|
8
8
|
easylink/implementation_metadata.yaml,sha256=VvlEu3Dvlmeh1MpzeYx91j22GiV-9mu3hZP5yVuW04o,6763
|
9
|
-
easylink/pipeline.py,sha256=
|
10
|
-
easylink/pipeline_graph.py,sha256=
|
9
|
+
easylink/pipeline.py,sha256=EyCXv5p9WzTqcndXK6ukBJE6jY_fWIP_DGZQUl1wRcY,12284
|
10
|
+
easylink/pipeline_graph.py,sha256=vsY6nW_iEwZCNf_N_3CsixsKBUy_5JxGEi61-1Q-KAw,22842
|
11
11
|
easylink/pipeline_schema.py,sha256=ckvA4deRYalY5dLLbJDrO_pKttMuWnEUvSn5fSdu4jc,5900
|
12
|
-
easylink/rule.py,sha256=
|
13
|
-
easylink/runner.py,sha256=
|
14
|
-
easylink/step.py,sha256=
|
12
|
+
easylink/rule.py,sha256=W97LMI-vkEPipJbnSZLn2BxfYfFtvzGTKzq6YgDVri0,19913
|
13
|
+
easylink/runner.py,sha256=k9ICTToHj2xr6MGIuvlWf6YMeZ47UGgseaMByMgUGac,6271
|
14
|
+
easylink/step.py,sha256=tTlDbhtjd7vkKmsnq622WnwQgBAdTN1dapUJqhUlPjA,65664
|
15
15
|
easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
|
16
16
|
easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
|
17
|
-
easylink/pipeline_schema_constants/__init__.py,sha256=
|
18
|
-
easylink/pipeline_schema_constants/development.py,sha256
|
19
|
-
easylink/pipeline_schema_constants/testing.py,sha256=
|
17
|
+
easylink/pipeline_schema_constants/__init__.py,sha256=RVUncdInRvafu10hbf7J9QQv7cE_pg3ylw_C0v1uIOY,684
|
18
|
+
easylink/pipeline_schema_constants/development.py,sha256=-F2xaht9u66oJsXyJ8-9Mnx0PoibVZNpTC6ReRykD9w,11280
|
19
|
+
easylink/pipeline_schema_constants/testing.py,sha256=icg7Vx0t8Wnic_Bx8tGkYB5wlZmrBqHYeQSRc9mR0Lo,10704
|
20
20
|
easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
|
21
21
|
easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
|
22
22
|
easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
|
@@ -36,13 +36,15 @@ easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-
|
|
36
36
|
easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
|
37
37
|
easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
|
38
38
|
easylink/utilities/__init__.py,sha256=EBk0rvRPZqwqzIqdVo8jkpSiZFFnj_fHaRB-P6EuCmk,59
|
39
|
-
easylink/utilities/
|
40
|
-
easylink/utilities/
|
39
|
+
easylink/utilities/aggregator_utils.py,sha256=7-zg7znkUow3SuwbqOAX1H0j2GzqU1RjLs2B-hiQWls,971
|
40
|
+
easylink/utilities/data_utils.py,sha256=D1Srj_2Ol5mAVt_D8k8fpm5E-GU8s6k1uKLatcQ1Oeo,1597
|
41
|
+
easylink/utilities/general_utils.py,sha256=IM78EToICkmkZX1pvYsU6uZnVvXYDmS26H9Tjmg0XCM,3293
|
41
42
|
easylink/utilities/paths.py,sha256=yl0cuWChJmB6YKMCQavTKw9jIl-VQhH6cnsM6D5c0Zk,599
|
42
43
|
easylink/utilities/spark.smk,sha256=tQ7RArNQzhjbaBQQcRORB4IxxkuDx4gPHUBcWHDYJ_U,5795
|
44
|
+
easylink/utilities/splitter_utils.py,sha256=riz3rflTrbkQ8uqMaqmXCY1BaWvgdxGzl8WN7Lb7eO8,2601
|
43
45
|
easylink/utilities/validation_utils.py,sha256=qOgn1n3_m5blFN7eHJ9MbOt5DkFA6DWucAOUAjvGvco,764
|
44
|
-
easylink-0.1.
|
45
|
-
easylink-0.1.
|
46
|
-
easylink-0.1.
|
47
|
-
easylink-0.1.
|
48
|
-
easylink-0.1.
|
46
|
+
easylink-0.1.7.dist-info/METADATA,sha256=tNiHPs5mHZUjAYO4hHpBS8k-26MgUZZe_g6W6GXuVV8,2804
|
47
|
+
easylink-0.1.7.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
|
48
|
+
easylink-0.1.7.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
49
|
+
easylink-0.1.7.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
50
|
+
easylink-0.1.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|