easylink 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,7 +70,7 @@ rule wait_for_spark_master:
70
70
  while true; do
71
71
 
72
72
  if [[ -e {params.spark_master_log_file} ]]; then
73
- found=`grep -o "\(spark://.*$\)" {params.spark_master_log_file} || true`
73
+ found=`grep -o "\\(spark://.*$\\)" {params.spark_master_log_file} || true`
74
74
 
75
75
  if [[ ! -z $found ]]; then
76
76
  echo "Spark master URL found: $found"
@@ -178,7 +178,7 @@ rule wait_for_spark_worker:
178
178
  while true; do
179
179
 
180
180
  if [[ -e {params.spark_worker_log_file} ]]; then
181
- found=`grep -o "\(Worker: Successfully registered with master $MASTER_URL\)" {params.spark_worker_log_file} || true`
181
+ found=`grep -o "\\(Worker: Successfully registered with master $MASTER_URL\\)" {params.spark_worker_log_file} || true`
182
182
 
183
183
  if [[ ! -z $found ]]; then
184
184
  echo "Spark Worker {wildcards.scatteritem} registered successfully"
@@ -70,3 +70,38 @@ def split_data_by_size(
70
70
  f"{chunk.index[-1]})"
71
71
  )
72
72
  chunk.to_parquet(os.path.join(chunk_dir, "result.parquet"))
73
+
74
+
75
+ def split_data_in_two(input_files: list[str], output_dir: str, *args, **kwargs) -> None:
76
+ """Splits the data (from a single input slot) into two chunks of roughly equal size.
77
+
78
+ This function takes all datasets from a single input slot, concatenates them,
79
+ and then splits the resulting dataset into two chunks of similar size.
80
+
81
+ Parameters
82
+ ----------
83
+ input_files
84
+ A list of input file paths to be concatenated and split.
85
+ output_dir
86
+ The directory where the resulting chunks will be saved.
87
+ *args, **kwargs
88
+ Additional arguments; accepted but not used by this function.
89
+ """
90
+
91
+ # concatenate all input files
92
+ df = pd.DataFrame()
93
+ for file in input_files:
94
+ tmp = pd.read_parquet(file)
95
+ df = pd.concat([df, tmp], ignore_index=True)
96
+
97
+ # divide df into two and save each chunk out
98
+ num_chunks = 2
99
+ chunk_size = math.ceil(len(df) / num_chunks)
100
+ for i in range(num_chunks):
101
+ start = i * chunk_size
102
+ end = (i + 1) * chunk_size
103
+ chunk = df.iloc[start:end]
104
+ chunk_dir = os.path.join(output_dir, f"chunk_{i}")
105
+ if not os.path.exists(chunk_dir):
106
+ os.makedirs(chunk_dir)
107
+ chunk.to_parquet(os.path.join(chunk_dir, "result.parquet"))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: easylink
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: Research repository for the EasyLink ER ecosystem project.
5
5
  Home-page: https://github.com/ihmeuw/easylink
6
6
  Author: The EasyLink developers
@@ -19,11 +19,14 @@ Requires-Dist: pyarrow
19
19
  Requires-Dist: snakemake>=8.0.0
20
20
  Requires-Dist: snakemake-interface-executor-plugins<9.0.0
21
21
  Requires-Dist: snakemake-executor-plugin-slurm
22
+ Requires-Dist: pandas-stubs
23
+ Requires-Dist: pyarrow-stubs
22
24
  Provides-Extra: docs
23
25
  Requires-Dist: sphinx<8.2.0; extra == "docs"
24
26
  Requires-Dist: sphinx-rtd-theme; extra == "docs"
25
27
  Requires-Dist: sphinx-autodoc-typehints; extra == "docs"
26
28
  Requires-Dist: sphinx-click; extra == "docs"
29
+ Requires-Dist: sphinx-autobuild; extra == "docs"
27
30
  Requires-Dist: typing_extensions; extra == "docs"
28
31
  Provides-Extra: test
29
32
  Requires-Dist: pytest; extra == "test"
@@ -34,6 +37,7 @@ Requires-Dist: sphinx<8.2.0; extra == "dev"
34
37
  Requires-Dist: sphinx-rtd-theme; extra == "dev"
35
38
  Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
36
39
  Requires-Dist: sphinx-click; extra == "dev"
40
+ Requires-Dist: sphinx-autobuild; extra == "dev"
37
41
  Requires-Dist: typing_extensions; extra == "dev"
38
42
  Requires-Dist: pytest; extra == "dev"
39
43
  Requires-Dist: pytest-cov; extra == "dev"
@@ -71,28 +75,32 @@ Installation
71
75
 
72
76
  There are a few things to install in order to use this package:
73
77
 
74
- - Install singularity. If this is not already installed on your system, you will
75
- likely need to request it from your system admin.
76
- Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html
78
+ - Install singularity.
77
79
 
80
+ You may need to request it from your system admin.
81
+ Refer to https://docs.sylabs.io/guides/4.1/admin-guide/installation.html.
82
+ You can check if you already have singularity installed by running the command ``singularity --version``. For an
83
+ existing installation, your singularity version number is printed.
78
84
 
79
- - Install graphviz via:
85
+ - Install conda.
86
+
87
+ We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can check if you already
88
+ have conda installed by running the command ``conda --version``. For an existing installation, a version
89
+ will be displayed.
80
90
 
81
- .. code-block:: console
82
-
83
- $ conda install graphviz
84
-
85
- - Install EasyLink.
91
+ - Install easylink, python and graphviz in a conda environment.
86
92
 
87
93
  Option 1 - Install from PyPI with pip::
88
94
 
95
+ $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
96
+ $ conda activate easylink
89
97
  $ pip install easylink
90
98
 
91
99
  Option 2 - Build from source with pip::
92
-
93
- $ git clone git@github.com:ihmeuw/easylink.git # or git clone https://github.com/ihmeuw/easylink.git
94
- $ cd easylink
95
- $ pip install .
100
+
101
+ $ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
102
+ $ conda activate easylink
103
+ $ pip install git+https://github.com/ihmeuw/easylink.git
96
104
 
97
105
  .. _end_installation:
98
106
 
@@ -1,22 +1,22 @@
1
1
  easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
2
2
  easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
3
- easylink/_version.py,sha256=khDKUuWafURKVs5EAZkpOMiUHI2-V7axlqrWLPUpuZo,23
3
+ easylink/_version.py,sha256=qb0TalpSt1CbprnFyeLUKqgrqNtmnk9IoQQ7umAoXVY,23
4
4
  easylink/cli.py,sha256=ARSKAljepNOEYd1VCS_QqBJQIBLzE3IgKiOb5-OROdY,6380
5
- easylink/configuration.py,sha256=Ire2pMZNZ6wtSwhcWnQpYa-snX4KrhXgovlQwQ2Wxf4,12530
6
- easylink/graph_components.py,sha256=PhMKxpgZjorhubS7vcta1pgXgXSGplmPulQpV0YZhqo,14811
7
- easylink/implementation.py,sha256=AwGl5YCKCSQo91owWj-gg9_5lBz7H_4q2z7jF0BhXs4,8992
5
+ easylink/configuration.py,sha256=lfm8ViUpr1-O-EovTjKZbAlIht2EBv3RndN1mzYbmDE,12565
6
+ easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
7
+ easylink/implementation.py,sha256=4u3QgLOrNttfU9Kd_9u_lg3in4ePoYUfO9u_udwiuh0,10878
8
8
  easylink/implementation_metadata.yaml,sha256=VvlEu3Dvlmeh1MpzeYx91j22GiV-9mu3hZP5yVuW04o,6763
9
- easylink/pipeline.py,sha256=EyCXv5p9WzTqcndXK6ukBJE6jY_fWIP_DGZQUl1wRcY,12284
10
- easylink/pipeline_graph.py,sha256=vsY6nW_iEwZCNf_N_3CsixsKBUy_5JxGEi61-1Q-KAw,22842
9
+ easylink/pipeline.py,sha256=5KOYH5HyJjVlFoBRKGLs2hn5mpC3tPYG_ux3T1qSV9k,17504
10
+ easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
11
11
  easylink/pipeline_schema.py,sha256=Q2sCpsC-F2W0yxVP7ufunowDepOBrRVENXOdap9J5iY,6921
12
- easylink/rule.py,sha256=W97LMI-vkEPipJbnSZLn2BxfYfFtvzGTKzq6YgDVri0,19913
13
- easylink/runner.py,sha256=k9ICTToHj2xr6MGIuvlWf6YMeZ47UGgseaMByMgUGac,6271
14
- easylink/step.py,sha256=Hweg1OAGcmrNAt95C-M4ksOAtc_db0oeibbF3cnqhq0,74951
12
+ easylink/rule.py,sha256=uoPj7yFFqiwvxlnhoejrZuPR3YX--y1k02uDDz3viTc,16196
13
+ easylink/runner.py,sha256=cbCo5_NvvulmjjAaBCG6qCmbtJiHK-7NuDvbngdU_PY,6675
14
+ easylink/step.py,sha256=u1AMPrYGNVb3ZH6uB_U0dUeJvOeQ2MoVHdlC8k63AA8,85226
15
15
  easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
16
16
  easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
17
- easylink/pipeline_schema_constants/__init__.py,sha256=uRVjQw7_Ff5IBQw0_Jc93Fzfa-MnbPVPKsy18CCaW7E,1021
18
- easylink/pipeline_schema_constants/development.py,sha256=0fc6xWRBr5e_xDaldR9sY2vMQJU1wnlhDQS_-yUOT6g,12339
19
- easylink/pipeline_schema_constants/testing.py,sha256=ohcTlT_viZYxS1GkO46mjkb8IzXo6yIOqvBbb4YrOhA,10897
17
+ easylink/pipeline_schema_constants/__init__.py,sha256=HbN-NytoGuk8aTfe0Wal232UnLopFBQGe2uRjmg_igQ,1272
18
+ easylink/pipeline_schema_constants/development.py,sha256=yRzkCiBqF_Jv3Y0GNvswVAWeZfKJRXk8Y8Q9ZhwCg_A,11596
19
+ easylink/pipeline_schema_constants/testing.py,sha256=8vVGj7opZ9Uzj7EHGMbgXyZj3_SboIeUPB0XlZkmvrM,18901
20
20
  easylink/steps/dev/README.md,sha256=u9dZUggpY2Lf2qb-xkDLWWgHjcmi4osbQtzSNo4uklE,4549
21
21
  easylink/steps/dev/build-containers-local.sh,sha256=Wy3pfcyt7I-BNvHcr7ZXDe0g5Ihd00BIPqt9YuRbLeA,259
22
22
  easylink/steps/dev/build-containers-remote.sh,sha256=Hy-kaaXf-ta6n8SzOz_ahByjMY5T7J71MvzXRXDvQw8,271
@@ -40,11 +40,11 @@ easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl
40
40
  easylink/utilities/data_utils.py,sha256=CcnM3u0_MQDQo3jMs3E4IK_rz8wAsFdJ674fZxYEFZg,4620
41
41
  easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
42
42
  easylink/utilities/paths.py,sha256=KM1GlnsAcKbUJrC4LZKpeJfPljxe_aXP1ZhVp43TYRA,924
43
- easylink/utilities/spark.smk,sha256=tQ7RArNQzhjbaBQQcRORB4IxxkuDx4gPHUBcWHDYJ_U,5795
44
- easylink/utilities/splitter_utils.py,sha256=y4CbbTBgRaoXFxy-9Eu5eWx4lA4ZEcbrYpxgLIzG_kc,2602
43
+ easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
44
+ easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
45
45
  easylink/utilities/validation_utils.py,sha256=W9r_RXcivJjfpioLhONirfwdByYttxNsVY489_sbrYQ,1683
46
- easylink-0.1.13.dist-info/METADATA,sha256=ooL68LseA1cN5X2wLOB_uIFgXIjW0PCINie7aMHw6t0,2805
47
- easylink-0.1.13.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
48
- easylink-0.1.13.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
49
- easylink-0.1.13.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
50
- easylink-0.1.13.dist-info/RECORD,,
46
+ easylink-0.1.15.dist-info/METADATA,sha256=wV8zQKO5KUtsvJIaAv7wi_I1Nich7hqjfuKNzQLM-o4,3449
47
+ easylink-0.1.15.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
48
+ easylink-0.1.15.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
49
+ easylink-0.1.15.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
50
+ easylink-0.1.15.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (79.0.0)
2
+ Generator: setuptools (80.3.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5