easylink 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ """
2
+ ========================
3
+ Data Splitting Utilities
4
+ ========================
5
+
6
+ This module contains utility functions for splitting datasets into smaller datasets.
7
+ One primary use case for this is to run sections of the pipeline in an embarrassingly
8
+ parallel manner.
9
+
10
+ Note that it is critical that all data splitting utility functions are defined
11
+ in this module; easylink will not be able to find them otherwise.
12
+ """
13
+
14
+ import math
15
+ import os
16
+
17
+ import pandas as pd
18
+ from loguru import logger
19
+
20
+
21
def split_data_by_size(
    input_files: list[str], output_dir: str, desired_chunk_size_mb: int | float
) -> None:
    """Splits the data (from a single input slot) into chunks of desired size.

    This function takes all datasets from a single input slot, concatenates them,
    and then splits the resulting dataset into chunks of the desired size. Note
    that this will split the data as evenly as possible, but the final chunk may
    be smaller than the desired size if the input data does not divide evenly; it
    makes no effort to redistribute the lingering data.

    The number of chunks is estimated from the on-disk size of the input files.
    Each chunk is written to ``<output_dir>/chunk_<i>/result.parquet``. At least
    one chunk is always written (even when the input has no rows) and no empty
    trailing chunks are produced.

    Parameters
    ----------
    input_files
        A list of input file paths to be concatenated and split.
    output_dir
        The directory where the resulting chunks will be saved.
    desired_chunk_size_mb
        The desired size of each chunk, in megabytes.

    Raises
    ------
    ValueError
        If ``input_files`` is empty or ``desired_chunk_size_mb`` is not positive.
    """
    if not input_files:
        raise ValueError("No input files were provided to split")
    if desired_chunk_size_mb <= 0:
        raise ValueError(
            f"desired_chunk_size_mb must be positive; got {desired_chunk_size_mb}"
        )

    # Read each file once and concatenate in a single call; growing a dataframe
    # inside the loop re-copies every previously read row on each iteration.
    input_file_size_mb = sum(os.path.getsize(file) for file in input_files) / 1024**2
    df = pd.concat([pd.read_parquet(file) for file in input_files], ignore_index=True)

    # Estimate the chunk count from the on-disk footprint of the inputs.
    requested_chunks = max(1, math.ceil(input_file_size_mb / desired_chunk_size_mb))
    chunk_size = math.ceil(len(df) / requested_chunks)
    # Rounding rows-per-chunk up can leave trailing chunks with no rows (e.g. 9
    # rows over 6 requested chunks), so recompute how many chunks actually hold
    # data. A dataset with no rows still gets a single (empty) chunk written.
    num_chunks = math.ceil(len(df) / chunk_size) if chunk_size else 1

    if requested_chunks == 1:
        logger.info("Input data is already smaller than desired chunk size; not splitting")
    else:
        logger.info(
            f"Splitting a {round(input_file_size_mb, 2)} MB dataset ({len(df)} rows) "
            f"into {num_chunks} chunks of size ~{desired_chunk_size_mb} MB each"
        )

    for i in range(num_chunks):
        chunk = df.iloc[i * chunk_size : (i + 1) * chunk_size]
        chunk_dir = os.path.join(output_dir, f"chunk_{i}")
        os.makedirs(chunk_dir, exist_ok=True)
        if len(chunk):
            logger.debug(
                f"Writing out chunk {i+1}/{num_chunks} (rows {chunk.index[0]} to "
                f"{chunk.index[-1]})"
            )
        else:
            # Only reachable when the whole dataset has no rows; indexing the
            # empty index for the log message would raise IndexError.
            logger.debug(f"Writing out chunk {i+1}/{num_chunks} (no rows)")
        chunk.to_parquet(os.path.join(chunk_dir, "result.parquet"))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: easylink
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Research repository for the EasyLink ER ecosystem project.
5
5
  Home-page: https://github.com/ihmeuw/easylink
6
6
  Author: The EasyLink developers
@@ -1,22 +1,22 @@
1
1
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
2
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
- easylink/_version.py,sha256=n3oM6B_EMz93NsTI18NNZd-jKFcUPzUkbIKj5VFK5ok,22
4
- easylink/cli.py,sha256=O0YvFnjxM3N8ox6wMi5-SSQZ6bS4_VcRT3apCIY7u0I,6117
3
+ easylink/_version.py,sha256=YpKDcdV7CqL8n45u267wKtyloM13FSVbOdrqgNZnSLM,22
4
+ easylink/cli.py,sha256=ARSKAljepNOEYd1VCS_QqBJQIBLzE3IgKiOb5-OROdY,6380
5
5
  easylink/configuration.py,sha256=Ire2pMZNZ6wtSwhcWnQpYa-snX4KrhXgovlQwQ2Wxf4,12530
6
- easylink/graph_components.py,sha256=LIHLrASrppTr9XHUFZymtT9rE7_cFzCB1LVxyO3pCWk,11554
7
- easylink/implementation.py,sha256=1TFbsUxOkDkyyMtCwYQtA6uwZb5TawEw4CHjjSot0_s,8873
6
+ easylink/graph_components.py,sha256=U6gbKjQVTBftdOGlH-oOKS6o0dyS88IL6MCpI_mhv3s,12354
7
+ easylink/implementation.py,sha256=AwGl5YCKCSQo91owWj-gg9_5lBz7H_4q2z7jF0BhXs4,8992
8
8
  easylink/implementation_metadata.yaml,sha256=VvlEu3Dvlmeh1MpzeYx91j22GiV-9mu3hZP5yVuW04o,6763
9
- easylink/pipeline.py,sha256=exnYtGqJX5GiLEI1h9dRe8K-sRT-6fBPufXdA8OeyPs,10350
10
- easylink/pipeline_graph.py,sha256=K8padPqQG6kSupYCQgbjYn_gs51tE1CmOvWmZ49EKLc,19222
9
+ easylink/pipeline.py,sha256=EyCXv5p9WzTqcndXK6ukBJE6jY_fWIP_DGZQUl1wRcY,12284
10
+ easylink/pipeline_graph.py,sha256=vsY6nW_iEwZCNf_N_3CsixsKBUy_5JxGEi61-1Q-KAw,22842
11
11
  easylink/pipeline_schema.py,sha256=ckvA4deRYalY5dLLbJDrO_pKttMuWnEUvSn5fSdu4jc,5900
12
- easylink/rule.py,sha256=dHr95tI4O39makPp9nEGFaIsGhOoa93RwuVzIVXUhak,7606
13
- easylink/runner.py,sha256=CSqYDWzY4pBvaklUUvj75UeJ4VxqwW9MYgcwGrAlspo,6222
14
- easylink/step.py,sha256=NCN1L5ojpfJ1CgV_Ih4duQL_aUFL3ri_XiNguH20JDE,62709
12
+ easylink/rule.py,sha256=W97LMI-vkEPipJbnSZLn2BxfYfFtvzGTKzq6YgDVri0,19913
13
+ easylink/runner.py,sha256=k9ICTToHj2xr6MGIuvlWf6YMeZ47UGgseaMByMgUGac,6271
14
+ easylink/step.py,sha256=tTlDbhtjd7vkKmsnq622WnwQgBAdTN1dapUJqhUlPjA,65664
15
15
  easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
16
16
  easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
17
- easylink/pipeline_schema_constants/__init__.py,sha256=gVg8_0xv4FxG3B-Jy5MwiXIVntu4m36uCdA3ohVTN7w,460
18
- easylink/pipeline_schema_constants/development.py,sha256=WaS6EQzgCI6opnY-7OCHpT0JpP2OvRmYrWVdt_ND_bk,10915
19
- easylink/pipeline_schema_constants/testing.py,sha256=GUJtVGIzLd71j7hX_qaSuMT3FCxwFOGzZwWc_1Tbtsk,7016
17
+ easylink/pipeline_schema_constants/__init__.py,sha256=RVUncdInRvafu10hbf7J9QQv7cE_pg3ylw_C0v1uIOY,684
18
+ easylink/pipeline_schema_constants/development.py,sha256=-F2xaht9u66oJsXyJ8-9Mnx0PoibVZNpTC6ReRykD9w,11280
19
+ easylink/pipeline_schema_constants/testing.py,sha256=icg7Vx0t8Wnic_Bx8tGkYB5wlZmrBqHYeQSRc9mR0Lo,10704
20
20
  easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
21
21
  easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
22
22
  easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
@@ -36,13 +36,15 @@ easylink/steps/dev/r/README.md,sha256=dPjZdDTqcJsZCiwhddzlOj1ob0P7YocZUNFrLIGM1-
36
36
  easylink/steps/dev/r/dummy_step.R,sha256=1TWZY8CEkT6gavrulBxFsKbDSKJJjk0NtJrGH7TIikE,4975
37
37
  easylink/steps/dev/r/r-image.def,sha256=LrhXlt0C3k7d_VJWopRPEVARnFWSuq_oILlwo7g03bE,627
38
38
  easylink/utilities/__init__.py,sha256=EBk0rvRPZqwqzIqdVo8jkpSiZFFnj_fHaRB-P6EuCmk,59
39
- easylink/utilities/data_utils.py,sha256=13z-hbI7BcEFzjI1Ns-s1ddvKRRKynHxPwrA7KQ90po,1575
40
- easylink/utilities/general_utils.py,sha256=PSlTiHmafuI_dDSH0cW7iSCkTV0WgqX9ZgzlT6kyy_E,3271
39
+ easylink/utilities/aggregator_utils.py,sha256=7-zg7znkUow3SuwbqOAX1H0j2GzqU1RjLs2B-hiQWls,971
40
+ easylink/utilities/data_utils.py,sha256=D1Srj_2Ol5mAVt_D8k8fpm5E-GU8s6k1uKLatcQ1Oeo,1597
41
+ easylink/utilities/general_utils.py,sha256=IM78EToICkmkZX1pvYsU6uZnVvXYDmS26H9Tjmg0XCM,3293
41
42
  easylink/utilities/paths.py,sha256=yl0cuWChJmB6YKMCQavTKw9jIl-VQhH6cnsM6D5c0Zk,599
42
43
  easylink/utilities/spark.smk,sha256=tQ7RArNQzhjbaBQQcRORB4IxxkuDx4gPHUBcWHDYJ_U,5795
44
+ easylink/utilities/splitter_utils.py,sha256=riz3rflTrbkQ8uqMaqmXCY1BaWvgdxGzl8WN7Lb7eO8,2601
43
45
  easylink/utilities/validation_utils.py,sha256=qOgn1n3_m5blFN7eHJ9MbOt5DkFA6DWucAOUAjvGvco,764
44
- easylink-0.1.6.dist-info/METADATA,sha256=Jbmi6F9PaLYIOqs63ViVjqaO6-3StEA7d_PE5N1UYik,2804
45
- easylink-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
46
- easylink-0.1.6.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
47
- easylink-0.1.6.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
48
- easylink-0.1.6.dist-info/RECORD,,
46
+ easylink-0.1.7.dist-info/METADATA,sha256=tNiHPs5mHZUjAYO4hHpBS8k-26MgUZZe_g6W6GXuVV8,2804
47
+ easylink-0.1.7.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
48
+ easylink-0.1.7.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
49
+ easylink-0.1.7.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
50
+ easylink-0.1.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5