easylink 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +5 -5
- easylink/graph_components.py +48 -51
- easylink/implementation.py +70 -10
- easylink/pipeline.py +127 -24
- easylink/pipeline_graph.py +46 -26
- easylink/pipeline_schema_constants/__init__.py +11 -7
- easylink/pipeline_schema_constants/development.py +2 -23
- easylink/pipeline_schema_constants/testing.py +243 -17
- easylink/rule.py +60 -140
- easylink/runner.py +14 -9
- easylink/step.py +397 -143
- easylink/utilities/spark.smk +2 -2
- easylink/utilities/splitter_utils.py +35 -0
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/METADATA +22 -14
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/RECORD +19 -19
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/WHEEL +1 -1
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.13.dist-info → easylink-0.1.15.dist-info}/top_level.txt +0 -0
easylink/utilities/spark.smk
CHANGED
@@ -70,7 +70,7 @@ rule wait_for_spark_master:
|
|
70
70
|
while true; do
|
71
71
|
|
72
72
|
if [[ -e {params.spark_master_log_file} ]]; then
|
73
|
-
found=`grep -o "
|
73
|
+
found=`grep -o "\\(spark://.*$\\)" {params.spark_master_log_file} || true`
|
74
74
|
|
75
75
|
if [[ ! -z $found ]]; then
|
76
76
|
echo "Spark master URL found: $found"
|
@@ -178,7 +178,7 @@ rule wait_for_spark_worker:
|
|
178
178
|
while true; do
|
179
179
|
|
180
180
|
if [[ -e {params.spark_worker_log_file} ]]; then
|
181
|
-
found=`grep -o "
|
181
|
+
found=`grep -o "\\(Worker: Successfully registered with master $MASTER_URL\\)" {params.spark_worker_log_file} || true`
|
182
182
|
|
183
183
|
if [[ ! -z $found ]]; then
|
184
184
|
echo "Spark Worker {wildcards.scatteritem} registered successfully"
|
@@ -70,3 +70,38 @@ def split_data_by_size(
|
|
70
70
|
f"{chunk.index[-1]})"
|
71
71
|
)
|
72
72
|
chunk.to_parquet(os.path.join(chunk_dir, "result.parquet"))
|
73
|
+
|
74
|
+
|
75
|
+
def split_data_in_two(input_files: list[str], output_dir: str, *args, **kwargs) -> None:
    """Split the data from a single input slot into two chunks of similar size.

    This function reads all datasets from a single input slot, concatenates
    them (resetting the row index), and then splits the resulting dataset
    row-wise into two chunks. The first chunk receives ``ceil(n_rows / 2)``
    rows and the second chunk receives the remainder, so the second chunk may
    be one row shorter (or empty when there are fewer than two rows). Each
    chunk is written to ``<output_dir>/chunk_<i>/result.parquet``.

    Parameters
    ----------
    input_files
        A list of input (parquet) file paths to be concatenated and split.
    output_dir
        The directory where the resulting chunks will be saved.
    *args
        Accepted but unused by this function.
    **kwargs
        Accepted but unused by this function.
    """

    # Read every input once and concatenate in a single call; growing a frame
    # inside the loop would re-copy all previously accumulated rows each time.
    frames = [pd.read_parquet(file) for file in input_files]
    # pd.concat raises on an empty list, so fall back to an empty frame to keep
    # writing two (empty) chunks when no input files are provided.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Divide df into two and save each chunk out.
    num_chunks = 2
    chunk_size = math.ceil(len(df) / num_chunks)
    for i in range(num_chunks):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df.iloc[start:end]
        chunk_dir = os.path.join(output_dir, f"chunk_{i}")
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(chunk_dir, exist_ok=True)
        chunk.to_parquet(os.path.join(chunk_dir, "result.parquet"))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: easylink
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.15
|
4
4
|
Summary: Research repository for the EasyLink ER ecosystem project.
|
5
5
|
Home-page: https://github.com/ihmeuw/easylink
|
6
6
|
Author: The EasyLink developers
|
@@ -19,11 +19,14 @@ Requires-Dist: pyarrow
|
|
19
19
|
Requires-Dist: snakemake>=8.0.0
|
20
20
|
Requires-Dist: snakemake-interface-executor-plugins<9.0.0
|
21
21
|
Requires-Dist: snakemake-executor-plugin-slurm
|
22
|
+
Requires-Dist: pandas-stubs
|
23
|
+
Requires-Dist: pyarrow-stubs
|
22
24
|
Provides-Extra: docs
|
23
25
|
Requires-Dist: sphinx<8.2.0; extra == "docs"
|
24
26
|
Requires-Dist: sphinx-rtd-theme; extra == "docs"
|
25
27
|
Requires-Dist: sphinx-autodoc-typehints; extra == "docs"
|
26
28
|
Requires-Dist: sphinx-click; extra == "docs"
|
29
|
+
Requires-Dist: sphinx-autobuild; extra == "docs"
|
27
30
|
Requires-Dist: typing_extensions; extra == "docs"
|
28
31
|
Provides-Extra: test
|
29
32
|
Requires-Dist: pytest; extra == "test"
|
@@ -34,6 +37,7 @@ Requires-Dist: sphinx<8.2.0; extra == "dev"
|
|
34
37
|
Requires-Dist: sphinx-rtd-theme; extra == "dev"
|
35
38
|
Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
|
36
39
|
Requires-Dist: sphinx-click; extra == "dev"
|
40
|
+
Requires-Dist: sphinx-autobuild; extra == "dev"
|
37
41
|
Requires-Dist: typing_extensions; extra == "dev"
|
38
42
|
Requires-Dist: pytest; extra == "dev"
|
39
43
|
Requires-Dist: pytest-cov; extra == "dev"
|
@@ -71,28 +75,32 @@ Installation
|
|
71
75
|
|
72
76
|
There are a few things to install in order to use this package:
|
73
77
|
|
74
|
-
- Install singularity.
|
75
|
-
likely need to request it from your system admin.
|
76
|
-
Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html
|
78
|
+
- Install singularity.
|
77
79
|
|
80
|
+
You may need to request it from your system admin.
|
81
|
+
Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html.
|
82
|
+
You can check if you already have singularity installed by running the command ``singularity --version``. For an
|
83
|
+
existing installation, your singularity version number is printed.
|
78
84
|
|
79
|
-
- Install
|
85
|
+
- Install conda.
|
86
|
+
|
87
|
+
We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can check if you already
|
88
|
+
have conda installed by running the command ``conda --version``. For an existing installation, a version
|
89
|
+
will be displayed.
|
80
90
|
|
81
|
-
|
82
|
-
|
83
|
-
$ conda install graphviz
|
84
|
-
|
85
|
-
- Install EasyLink.
|
91
|
+
- Install easylink, python and graphviz in a conda environment.
|
86
92
|
|
87
93
|
Option 1 - Install from PyPI with pip::
|
88
94
|
|
95
|
+
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
96
|
+
$ conda activate easylink
|
89
97
|
$ pip install easylink
|
90
98
|
|
91
99
|
Option 2 - Build from source with pip::
|
92
|
-
|
93
|
-
$
|
94
|
-
$
|
95
|
-
$ pip install .
|
100
|
+
|
101
|
+
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
102
|
+
$ conda activate easylink
|
103
|
+
$ pip install git+https://github.com/ihmeuw/easylink.git
|
96
104
|
|
97
105
|
.. _end_installation:
|
98
106
|
|
@@ -1,22 +1,22 @@
|
|
1
1
|
easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
|
2
2
|
easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
|
3
|
-
easylink/_version.py,sha256=
|
3
|
+
easylink/_version.py,sha256=qb0TalpSt1CbprnFyeLUKqgrqNtmnk9IoQQ7umAoXVY,23
|
4
4
|
easylink/cli.py,sha256=ARSKAljepNOEYd1VCS_QqBJQIBLzE3IgKiOb5-OROdY,6380
|
5
|
-
easylink/configuration.py,sha256=
|
6
|
-
easylink/graph_components.py,sha256=
|
7
|
-
easylink/implementation.py,sha256=
|
5
|
+
easylink/configuration.py,sha256=lfm8ViUpr1-O-EovTjKZbAlIht2EBv3RndN1mzYbmDE,12565
|
6
|
+
easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
|
7
|
+
easylink/implementation.py,sha256=4u3QgLOrNttfU9Kd_9u_lg3in4ePoYUfO9u_udwiuh0,10878
|
8
8
|
easylink/implementation_metadata.yaml,sha256=VvlEu3Dvlmeh1MpzeYx91j22GiV-9mu3hZP5yVuW04o,6763
|
9
|
-
easylink/pipeline.py,sha256=
|
10
|
-
easylink/pipeline_graph.py,sha256=
|
9
|
+
easylink/pipeline.py,sha256=5KOYH5HyJjVlFoBRKGLs2hn5mpC3tPYG_ux3T1qSV9k,17504
|
10
|
+
easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
|
11
11
|
easylink/pipeline_schema.py,sha256=Q2sCpsC-F2W0yxVP7ufunowDepOBrRVENXOdap9J5iY,6921
|
12
|
-
easylink/rule.py,sha256=
|
13
|
-
easylink/runner.py,sha256=
|
14
|
-
easylink/step.py,sha256=
|
12
|
+
easylink/rule.py,sha256=uoPj7yFFqiwvxlnhoejrZuPR3YX--y1k02uDDz3viTc,16196
|
13
|
+
easylink/runner.py,sha256=cbCo5_NvvulmjjAaBCG6qCmbtJiHK-7NuDvbngdU_PY,6675
|
14
|
+
easylink/step.py,sha256=u1AMPrYGNVb3ZH6uB_U0dUeJvOeQ2MoVHdlC8k63AA8,85226
|
15
15
|
easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
|
16
16
|
easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
|
17
|
-
easylink/pipeline_schema_constants/__init__.py,sha256=
|
18
|
-
easylink/pipeline_schema_constants/development.py,sha256=
|
19
|
-
easylink/pipeline_schema_constants/testing.py,sha256=
|
17
|
+
easylink/pipeline_schema_constants/__init__.py,sha256=HbN-NytoGuk8aTfe0Wal232UnLopFBQGe2uRjmg_igQ,1272
|
18
|
+
easylink/pipeline_schema_constants/development.py,sha256=yRzkCiBqF_Jv3Y0GNvswVAWeZfKJRXk8Y8Q9ZhwCg_A,11596
|
19
|
+
easylink/pipeline_schema_constants/testing.py,sha256=8vVGj7opZ9Uzj7EHGMbgXyZj3_SboIeUPB0XlZkmvrM,18901
|
20
20
|
easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
|
21
21
|
easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
|
22
22
|
easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
|
@@ -40,11 +40,11 @@ easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl
|
|
40
40
|
easylink/utilities/data_utils.py,sha256=CcnM3u0_MQDQo3jMs3E4IK_rz8wAsFdJ674fZxYEFZg,4620
|
41
41
|
easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
|
42
42
|
easylink/utilities/paths.py,sha256=KM1GlnsAcKbUJrC4LZKpeJfPljxe_aXP1ZhVp43TYRA,924
|
43
|
-
easylink/utilities/spark.smk,sha256=
|
44
|
-
easylink/utilities/splitter_utils.py,sha256=
|
43
|
+
easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
|
44
|
+
easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
|
45
45
|
easylink/utilities/validation_utils.py,sha256=W9r_RXcivJjfpioLhONirfwdByYttxNsVY489_sbrYQ,1683
|
46
|
-
easylink-0.1.
|
47
|
-
easylink-0.1.
|
48
|
-
easylink-0.1.
|
49
|
-
easylink-0.1.
|
50
|
-
easylink-0.1.
|
46
|
+
easylink-0.1.15.dist-info/METADATA,sha256=wV8zQKO5KUtsvJIaAv7wi_I1Nich7hqjfuKNzQLM-o4,3449
|
47
|
+
easylink-0.1.15.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
48
|
+
easylink-0.1.15.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
|
49
|
+
easylink-0.1.15.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
|
50
|
+
easylink-0.1.15.dist-info/RECORD,,
|
File without changes
|
File without changes
|