easylink 0.1.23__tar.gz → 0.1.25__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {easylink-0.1.23 → easylink-0.1.25}/CHANGELOG.rst +9 -0
- {easylink-0.1.23 → easylink-0.1.25}/PKG-INFO +27 -11
- easylink-0.1.25/README.rst +70 -0
- easylink-0.1.25/docs/source/user_guide/tutorials/input_data_demo.yaml +3 -0
- easylink-0.1.23/docs/source/user_guide/tutorials/pipeline_demo_naive.yaml → easylink-0.1.25/docs/source/user_guide/tutorials/pipeline_demo_improved.yaml +6 -7
- easylink-0.1.25/docs/source/user_guide/tutorials/pipeline_demo_naive.yaml +70 -0
- easylink-0.1.25/src/easylink/_version.py +1 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/cli.py +15 -10
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/implementation_metadata.yaml +70 -44
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/rule.py +2 -1
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/runner.py +118 -1
- easylink-0.1.25/src/easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/splink/splink_evaluating_pairs.py +2 -1
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/general_utils.py +18 -8
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/PKG-INFO +27 -11
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/SOURCES.txt +3 -1
- easylink-0.1.23/tests/e2e/test_easylink_run.py → easylink-0.1.25/tests/e2e/test_easylink_cli.py +23 -0
- easylink-0.1.25/tests/specifications/common/environment_local.yaml +2 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/pipeline_cascade.yaml +11 -11
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/pipeline_splink_dummy.yaml +6 -6
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/pipeline_with_fastLink.yaml +5 -5
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_rule.py +98 -0
- easylink-0.1.25/tests/unit/test_runner.py +116 -0
- easylink-0.1.23/README.rst +0 -54
- easylink-0.1.23/docs/source/user_guide/tutorials/input_data_demo.yaml +0 -3
- easylink-0.1.23/docs/source/user_guide/tutorials/pipeline_demo_improved.yaml +0 -71
- easylink-0.1.23/src/easylink/_version.py +0 -1
- easylink-0.1.23/tests/unit/test_runner.py +0 -50
- {easylink-0.1.23 → easylink-0.1.25}/.bandit +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.flake8 +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.github/CODEOWNERS +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.github/pull_request_template.md +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.github/workflows/deploy.yml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.github/workflows/update_readme.yml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.gitignore +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/.readthedocs.yml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/CONTRIBUTING.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/Jenkinsfile +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/LICENSE +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/Makefile +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/Makefile +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/nitpick-exceptions +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/_static/style.css +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/_templates/layout.html +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/cli.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/configuration.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/graph_components.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/implementation.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/pipeline.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/pipeline_graph.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/pipeline_schema.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/pipeline_schema_constants/development.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/pipeline_schema_constants/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/pipeline_schema_constants/testing.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/rule.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/runner.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/step.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/aggregator_utils.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/data_utils.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/general_utils.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/paths.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/splitter_utils.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/api_reference/utilities/validation_utils.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/01_step.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/02_default_implementation.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/03_slots.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/04_data_dependency.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/05_pipeline_schema.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/06_default_input.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/07_cloneable_section.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/08_cloneable_section_expanded.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/09_loopable_section.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/10_loopable_section_expanded.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/11_cloneable_section_splitter.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/12_cloneable_section_splitter_expanded.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/13_autoparallel_section.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/14_choice_section.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/15_choice_section_expanded.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/16_step_hierarchy.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/18_schema_to_pipeline.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/19_schema_to_pipeline_combined.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/clustering_sub_steps.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/easylink_pipeline_schema.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/entity_resolution_sub_steps.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/images/linking_sub_steps.drawio.png +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/pipeline_schema/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/concepts/workarounds.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/conf.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/glossary.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/cli.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/2020/input_file_ssa.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/2020/input_file_w2.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/2030/input_file_ssa.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/2030/input_file_w2.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/DAG-common-pipeline.svg +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/DAG-e2e-pipeline-expanded.svg +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/DAG-e2e-pipeline.svg +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/DAG-r-pyspark.svg +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/create_inputs_demo.ipynb +0 -0
- {easylink-0.1.23/tests/specifications/common → easylink-0.1.25/docs/source/user_guide/tutorials}/environment_local.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/environment_slurm.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/getting_started.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/impl-config-pipeline.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/index.rst +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/input_data.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/input_data_demo_2030.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/input_file_1.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/input_file_2.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/input_file_3.parquet +0 -0
- {easylink-0.1.23/src/easylink/steps/rl-dummy/input_data → easylink-0.1.25/docs/source/user_guide/tutorials}/known_clusters.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/print_fp_fn_w2_ssa.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/print_metrics_w2_ssa.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/docs/source/user_guide/tutorials/r_spark_pipeline.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/pyproject.toml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/python_versions.json +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/pytype.cfg +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/setup.cfg +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/setup.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/__about__.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/__init__.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/configuration.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/devtools/implementation_creator.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/graph_components.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/implementation.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline_graph.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline_schema.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline_schema_constants/__init__.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline_schema_constants/development.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline_schema_constants/main.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/pipeline_schema_constants/testing.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/step.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/cascading/exclude_clustered.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/cascading/exclude_clustered.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/cascading/exclude_none.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/cascading/exclude_none.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/cascading/update_clusters_by_connected_components.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/cascading/update_clusters_by_connected_components.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_clusters_to_links.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_clusters_to_links.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_determining_exclusions.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_determining_exclusions.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_removing_records.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_removing_records.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_schema_alignment.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_schema_alignment.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_updating_clusters.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/default/default_updating_clusters.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/README.md +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/build-containers-local.sh +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/build-containers-remote.sh +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/input_data/create_input_files.ipynb +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/input_data/input_file_1.csv +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/input_data/input_file_1.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/input_data/input_file_2.csv +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/input_data/input_file_2.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/python_pandas/README.md +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/python_pandas/dummy_step.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/python_pandas/python_pandas.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/python_pyspark/README.md +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/python_pyspark/dummy_step.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/python_pyspark/python_pyspark.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/r/README.md +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/r/dummy_step.R +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/r/r-image.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/dev/test.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/example/middle_name_to_initial.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/example/middle_name_to_initial.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/fastLink/fastLink_evaluating_pairs.R +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/fastLink/fastLink_evaluating_pairs.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/fastLink/fastLink_links_to_clusters.R +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/fastLink/fastLink_links_to_clusters.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/output_dir/dummy_step_1_for_output_dir_example.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/output_dir/dummy_step_1_for_output_dir_example.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/output_dir/dummy_step_2_for_output_dir_example.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/output_dir/dummy_step_2_for_output_dir_example.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/input_data/create_input_files.ipynb +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/splink/splink_blocking_and_filtering.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/splink/splink_blocking_and_filtering.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/splink/splink_evaluating_pairs.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/splink/splink_links_to_clusters.def +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/steps/splink/splink_links_to_clusters.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/__init__.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/aggregator_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/data_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/paths.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/spark.smk +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/splitter_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink/utilities/validation_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/dependency_links.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/entry_points.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/not-zip-safe +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/requires.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/src/easylink.egg-info/top_level.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/__init__.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/conftest.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/e2e/pipeline_improved_results.csv +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/e2e/pipeline_improved_results_2030.csv +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/e2e/pipeline_naive_results.csv +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/e2e/pipeline_splink_dummy_results.csv +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/e2e/test_pipelines_main_schema.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/e2e/test_step_types.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/integration/test_compositions.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/integration/test_data_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/integration/test_snakemake.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/integration/test_snakemake_slurm.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/integration/test_snakemake_spark.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/common/input_data.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/common/input_data_one_file.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/common/pipeline.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/environment_slurm.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/environment_slurm_4GB.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/input_data_dummy.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/pipeline.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/e2e/pipeline_expanded.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/auto_parallel/pipeline_cloneable_step.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/auto_parallel/pipeline_hierarchical_step.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/auto_parallel/pipeline_loop_step.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/environment_spark_slurm.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/pipeline.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/pipeline_output_dir.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/pipeline_output_dir_default.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/integration/pipeline_spark.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/environment_minimum.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/environment_spark_slurm.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_bad_combined_implementations.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_bad_implementation.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_bad_loop_formatting.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_bad_step.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_bad_type_key.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_bad_implementation_names.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_bad_topology.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_two_steps.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_with_extra_node.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_with_iteration.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_with_iteration_cycle.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_with_missing_node.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_combine_with_parallel.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_default_implementations.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_missing_implementation_name.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_missing_implementations.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_missing_loop_nodes.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_missing_step.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_missing_substeps.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_missing_type_key.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_nested_templated_steps.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_out_of_order.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_spark.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_type_config_mismatch.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/specifications/unit/pipeline_wrong_clone_keys.yaml +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/__init__.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/conftest.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/recipe_strings/python_pandas.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/aggregation_rule.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/auto_parallel_rule.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/checkpoint_rule.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/implemented_rule_local.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/implemented_rule_slurm.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/pipeline_local.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/pipeline_slurm.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/target_rule.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/rule_strings/validation_rule.txt +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_cli.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_config.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_data_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_general_utils.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_graph_components.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_implementation.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_implementation_creator.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_pipeline.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_pipeline_graph.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_pipeline_schema.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_step.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/tests/unit/test_validations.py +0 -0
- {easylink-0.1.23 → easylink-0.1.25}/update_readme.py +0 -0
@@ -1,3 +1,12 @@
|
|
1
|
+
**0.1.25 - 6/30/25**
|
2
|
+
|
3
|
+
- Release new images
|
4
|
+
- Clean up stdout logging
|
5
|
+
|
6
|
+
**0.1.24 - 6/26/25**
|
7
|
+
|
8
|
+
- Properly escape special characters in envars specified via pipeline configuration
|
9
|
+
|
1
10
|
**0.1.23 - 6/25/25**
|
2
11
|
|
3
12
|
- Remove "dummy" pipeline references from codebase
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: easylink
|
3
|
-
Version: 0.1.23
|
3
|
+
Version: 0.1.25
|
4
4
|
Summary: Research repository for the EasyLink ER ecosystem project.
|
5
5
|
Home-page: https://github.com/ihmeuw/easylink
|
6
6
|
Author: The EasyLink developers
|
@@ -78,34 +78,50 @@ Installation
|
|
78
78
|
|
79
79
|
.. _installation:
|
80
80
|
|
81
|
+
**NOTE: This package requires AMD64 CPU architecture - it is not compatible with
|
82
|
+
Apple's ARM64 architecture (e.g. M1 and newer Macs).**
|
83
|
+
|
81
84
|
There are a few things to install in order to use this package:
|
82
85
|
|
83
|
-
-
|
86
|
+
- Set up Linux.
|
87
|
+
|
88
|
+
Singularity (and thus EasyLink) requires Linux to run. If you are not already
|
89
|
+
using Linux, you will need to set up a virtual machine; refer to the
|
90
|
+
`Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
|
84
91
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
``singularity --version``. For an existing installation, your
|
92
|
+
- Install Singularity.
|
93
|
+
|
94
|
+
First check if you already have Singularity installed by running the command
|
95
|
+
``singularity --version``. For an existing installation, your Singularity version
|
89
96
|
number is printed.
|
90
97
|
|
98
|
+
If Singularity is not yet installed, you will need to install it;
|
99
|
+
refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
|
100
|
+
|
101
|
+
Note that this requires administrator privileges; you may need to request installation
|
102
|
+
from your system admin if you are working in a shared computing environment.
|
103
|
+
|
91
104
|
- Install conda.
|
92
105
|
|
93
106
|
We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
|
94
107
|
check if you already have conda installed by running the command ``conda --version``.
|
95
108
|
For an existing installation, a version will be displayed.
|
96
109
|
|
97
|
-
-
|
110
|
+
- Create a conda environment with python and graphviz installed.
|
111
|
+
|
112
|
+
::
|
113
|
+
|
114
|
+
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
115
|
+
$ conda activate easylink
|
116
|
+
|
117
|
+
- Install easylink in the environment.
|
98
118
|
|
99
119
|
Option 1 - Install from PyPI with pip::
|
100
120
|
|
101
|
-
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
102
|
-
$ conda activate easylink
|
103
121
|
$ pip install easylink
|
104
122
|
|
105
123
|
Option 2 - Build from source with pip::
|
106
124
|
|
107
|
-
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
108
|
-
$ conda activate easylink
|
109
125
|
$ pip install git+https://github.com/ihmeuw/easylink.git
|
110
126
|
|
111
127
|
.. _end_installation:
|
@@ -0,0 +1,70 @@
|
|
1
|
+
========
|
2
|
+
EasyLink
|
3
|
+
========
|
4
|
+
|
5
|
+
EasyLink is a framework that allows users to build and run highly configurable
|
6
|
+
entity resolution (ER) pipelines.
|
7
|
+
|
8
|
+
.. _python_support:
|
9
|
+
|
10
|
+
**Supported Python versions: 3.11, 3.12**
|
11
|
+
|
12
|
+
.. _end_python_support:
|
13
|
+
|
14
|
+
Installation
|
15
|
+
============
|
16
|
+
|
17
|
+
.. _installation:
|
18
|
+
|
19
|
+
**NOTE: This package requires AMD64 CPU architecture - it is not compatible with
|
20
|
+
Apple's ARM64 architecture (e.g. M1 and newer Macs).**
|
21
|
+
|
22
|
+
There are a few things to install in order to use this package:
|
23
|
+
|
24
|
+
- Set up Linux.
|
25
|
+
|
26
|
+
Singularity (and thus EasyLink) requires Linux to run. If you are not already
|
27
|
+
using Linux, you will need to set up a virtual machine; refer to the
|
28
|
+
`Singularity documentation for installing on Windows or Mac <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-windows-or-mac>`_.
|
29
|
+
|
30
|
+
- Install Singularity.
|
31
|
+
|
32
|
+
First check if you already have Singularity installed by running the command
|
33
|
+
``singularity --version``. For an existing installation, your Singularity version
|
34
|
+
number is printed.
|
35
|
+
|
36
|
+
If Singularity is not yet installed, you will need to install it;
|
37
|
+
refer to the `Singularity docs for installing on Linux <https://docs.sylabs.io/guides/4.1/admin-guide/installation.html#installation-on-linux>`_.
|
38
|
+
|
39
|
+
Note that this requires administrator privileges; you may need to request installation
|
40
|
+
from your system admin if you are working in a shared computing environment.
|
41
|
+
|
42
|
+
- Install conda.
|
43
|
+
|
44
|
+
We recommend `miniforge <https://github.com/conda-forge/miniforge>`_. You can
|
45
|
+
check if you already have conda installed by running the command ``conda --version``.
|
46
|
+
For an existing installation, a version will be displayed.
|
47
|
+
|
48
|
+
- Create a conda environment with python and graphviz installed.
|
49
|
+
|
50
|
+
::
|
51
|
+
|
52
|
+
$ conda create --name easylink -c conda-forge python=3.12 graphviz 'gcc<14' -y
|
53
|
+
$ conda activate easylink
|
54
|
+
|
55
|
+
- Install easylink in the environment.
|
56
|
+
|
57
|
+
Option 1 - Install from PyPI with pip::
|
58
|
+
|
59
|
+
$ pip install easylink
|
60
|
+
|
61
|
+
Option 2 - Build from source with pip::
|
62
|
+
|
63
|
+
$ pip install git+https://github.com/ihmeuw/easylink.git
|
64
|
+
|
65
|
+
.. _end_installation:
|
66
|
+
|
67
|
+
Documentation
|
68
|
+
=============
|
69
|
+
|
70
|
+
You can view documentation at https://easylink.readthedocs.io/en/latest/
|
@@ -37,7 +37,7 @@ steps:
|
|
37
37
|
configuration:
|
38
38
|
INPUT_DATASET: input_file_ssa
|
39
39
|
- implementation:
|
40
|
-
name:
|
40
|
+
name: no_pre-processing
|
41
41
|
configuration:
|
42
42
|
INPUT_DATASET: input_file_w2
|
43
43
|
schema_alignment:
|
@@ -47,17 +47,16 @@ steps:
|
|
47
47
|
implementation:
|
48
48
|
name: splink_blocking_and_filtering
|
49
49
|
configuration:
|
50
|
-
BLOCKING_RULES: "'l.first_name == r.first_name,l.last_name == r.last_name'"
|
51
50
|
LINK_ONLY: true
|
51
|
+
BLOCKING_RULES: "l.first_name == r.first_name,l.last_name == r.last_name"
|
52
52
|
evaluating_pairs:
|
53
53
|
implementation:
|
54
54
|
name: splink_evaluating_pairs
|
55
55
|
configuration:
|
56
|
-
BLOCKING_RULES_FOR_TRAINING: "'l.first_name == r.first_name,l.last_name == r.last_name'"
|
57
|
-
COMPARISONS: "'ssn:exact,first_name:exact,middle_initial:exact,last_name:exact'"
|
58
|
-
PROBABILITY_TWO_RANDOM_RECORDS_MATCH: 0.0001 # == 1 / len(w2)
|
59
|
-
THRESHOLD_MATCH_PROBABILITY: 0
|
60
56
|
LINK_ONLY: true
|
57
|
+
BLOCKING_RULES_FOR_TRAINING: "l.first_name == r.first_name,l.last_name == r.last_name"
|
58
|
+
COMPARISONS: "ssn:levenshtein,first_name:name,middle_initial:exact,last_name:name"
|
59
|
+
PROBABILITY_TWO_RANDOM_RECORDS_MATCH: 0.0001 # == 1 / len(w2)
|
61
60
|
links_to_clusters:
|
62
61
|
implementation:
|
63
62
|
name: splink_links_to_clusters
|
@@ -68,4 +67,4 @@ steps:
|
|
68
67
|
name: default_updating_clusters
|
69
68
|
canonicalizing_and_downstream_analysis:
|
70
69
|
implementation:
|
71
|
-
name:
|
70
|
+
name: save_clusters
|
@@ -0,0 +1,70 @@
|
|
1
|
+
steps:
|
2
|
+
entity_resolution:
|
3
|
+
substeps:
|
4
|
+
determining_exclusions_and_removing_records:
|
5
|
+
clones:
|
6
|
+
- determining_exclusions:
|
7
|
+
implementation:
|
8
|
+
name: default_determining_exclusions
|
9
|
+
configuration:
|
10
|
+
INPUT_DATASET: input_file_ssa
|
11
|
+
removing_records:
|
12
|
+
implementation:
|
13
|
+
name: default_removing_records
|
14
|
+
configuration:
|
15
|
+
INPUT_DATASET: input_file_ssa
|
16
|
+
- determining_exclusions:
|
17
|
+
implementation:
|
18
|
+
name: default_determining_exclusions
|
19
|
+
configuration:
|
20
|
+
INPUT_DATASET: input_file_w2
|
21
|
+
removing_records:
|
22
|
+
implementation:
|
23
|
+
name: default_removing_records
|
24
|
+
configuration:
|
25
|
+
INPUT_DATASET: input_file_w2
|
26
|
+
clustering:
|
27
|
+
substeps:
|
28
|
+
clusters_to_links:
|
29
|
+
implementation:
|
30
|
+
name: default_clusters_to_links
|
31
|
+
linking:
|
32
|
+
substeps:
|
33
|
+
pre-processing:
|
34
|
+
clones:
|
35
|
+
- implementation:
|
36
|
+
name: middle_name_to_initial
|
37
|
+
configuration:
|
38
|
+
INPUT_DATASET: input_file_ssa
|
39
|
+
- implementation:
|
40
|
+
name: no_pre-processing
|
41
|
+
configuration:
|
42
|
+
INPUT_DATASET: input_file_w2
|
43
|
+
schema_alignment:
|
44
|
+
implementation:
|
45
|
+
name: default_schema_alignment
|
46
|
+
blocking_and_filtering:
|
47
|
+
implementation:
|
48
|
+
name: splink_blocking_and_filtering
|
49
|
+
configuration:
|
50
|
+
LINK_ONLY: true
|
51
|
+
BLOCKING_RULES: "l.first_name == r.first_name,l.last_name == r.last_name"
|
52
|
+
evaluating_pairs:
|
53
|
+
implementation:
|
54
|
+
name: splink_evaluating_pairs
|
55
|
+
configuration:
|
56
|
+
LINK_ONLY: true
|
57
|
+
BLOCKING_RULES_FOR_TRAINING: "l.first_name == r.first_name,l.last_name == r.last_name"
|
58
|
+
COMPARISONS: "ssn:exact,first_name:exact,middle_initial:exact,last_name:exact"
|
59
|
+
PROBABILITY_TWO_RANDOM_RECORDS_MATCH: 0.0001 # == 1 / len(w2)
|
60
|
+
links_to_clusters:
|
61
|
+
implementation:
|
62
|
+
name: splink_links_to_clusters
|
63
|
+
configuration:
|
64
|
+
THRESHOLD_MATCH_PROBABILITY: 0.996
|
65
|
+
updating_clusters:
|
66
|
+
implementation:
|
67
|
+
name: default_updating_clusters
|
68
|
+
canonicalizing_and_downstream_analysis:
|
69
|
+
implementation:
|
70
|
+
name: save_clusters
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "0.1.25"
|
@@ -201,16 +201,20 @@ def run(
|
|
201
201
|
main = handle_exceptions(
|
202
202
|
func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
|
203
203
|
)
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
204
|
+
try:
|
205
|
+
main(
|
206
|
+
command="run",
|
207
|
+
pipeline_specification=pipeline_specification,
|
208
|
+
input_data=input_data,
|
209
|
+
computing_environment=computing_environment,
|
210
|
+
results_dir=results_dir,
|
211
|
+
images_dir=images,
|
212
|
+
schema_name=schema,
|
213
|
+
)
|
214
|
+
except SystemExit:
|
215
|
+
# Snakemake uses SystemExit for completion - log success and re-raise
|
216
|
+
logger.info("*** FINISHED ***")
|
217
|
+
raise
|
214
218
|
|
215
219
|
|
216
220
|
@easylink.command()
|
@@ -243,6 +247,7 @@ def generate_dag(
|
|
243
247
|
input_data=input_data,
|
244
248
|
computing_environment=None,
|
245
249
|
results_dir=results_dir,
|
250
|
+
images_dir=None,
|
246
251
|
schema_name=schema,
|
247
252
|
)
|
248
253
|
logger.info("*** DAG saved to result directory ***")
|
@@ -2,7 +2,7 @@ step_1_python_pandas:
|
|
2
2
|
steps:
|
3
3
|
- step_1
|
4
4
|
image_name: python_pandas.sif
|
5
|
-
zenodo_record_id:
|
5
|
+
zenodo_record_id: 15757317
|
6
6
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
7
7
|
script_cmd: python /dummy_step.py
|
8
8
|
outputs:
|
@@ -11,7 +11,7 @@ step_1a_python_pandas:
|
|
11
11
|
steps:
|
12
12
|
- step_1a
|
13
13
|
image_name: python_pandas.sif
|
14
|
-
zenodo_record_id:
|
14
|
+
zenodo_record_id: 15757317
|
15
15
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
16
16
|
script_cmd: python /dummy_step.py
|
17
17
|
env:
|
@@ -22,7 +22,7 @@ step_1b_python_pandas:
|
|
22
22
|
steps:
|
23
23
|
- step_1b
|
24
24
|
image_name: python_pandas.sif
|
25
|
-
zenodo_record_id:
|
25
|
+
zenodo_record_id: 15757317
|
26
26
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
27
27
|
script_cmd: python /dummy_step.py
|
28
28
|
env:
|
@@ -33,7 +33,7 @@ step_2_python_pandas:
|
|
33
33
|
steps:
|
34
34
|
- step_2
|
35
35
|
image_name: python_pandas.sif
|
36
|
-
zenodo_record_id:
|
36
|
+
zenodo_record_id: 15757317
|
37
37
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
38
38
|
script_cmd: python /dummy_step.py
|
39
39
|
outputs:
|
@@ -42,7 +42,7 @@ step_3_python_pandas:
|
|
42
42
|
steps:
|
43
43
|
- step_3
|
44
44
|
image_name: python_pandas.sif
|
45
|
-
zenodo_record_id:
|
45
|
+
zenodo_record_id: 15757317
|
46
46
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
47
47
|
script_cmd: python /dummy_step.py
|
48
48
|
outputs:
|
@@ -51,7 +51,7 @@ step_4_python_pandas:
|
|
51
51
|
steps:
|
52
52
|
- step_4
|
53
53
|
image_name: python_pandas.sif
|
54
|
-
zenodo_record_id:
|
54
|
+
zenodo_record_id: 15757317
|
55
55
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
56
56
|
script_cmd: python /dummy_step.py
|
57
57
|
env:
|
@@ -62,7 +62,7 @@ step_5_python_pandas:
|
|
62
62
|
steps:
|
63
63
|
- step_5
|
64
64
|
image_name: python_pandas.sif
|
65
|
-
zenodo_record_id:
|
65
|
+
zenodo_record_id: 15757317
|
66
66
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
67
67
|
script_cmd: python /dummy_step.py
|
68
68
|
env:
|
@@ -73,7 +73,7 @@ step_6_python_pandas:
|
|
73
73
|
steps:
|
74
74
|
- step_6
|
75
75
|
image_name: python_pandas.sif
|
76
|
-
zenodo_record_id:
|
76
|
+
zenodo_record_id: 15757317
|
77
77
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
78
78
|
script_cmd: python /dummy_step.py
|
79
79
|
env:
|
@@ -84,7 +84,7 @@ step_4a_python_pandas:
|
|
84
84
|
steps:
|
85
85
|
- step_4a
|
86
86
|
image_name: python_pandas.sif
|
87
|
-
zenodo_record_id:
|
87
|
+
zenodo_record_id: 15757317
|
88
88
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
89
89
|
script_cmd: python /dummy_step.py
|
90
90
|
env:
|
@@ -95,7 +95,7 @@ step_4b_python_pandas:
|
|
95
95
|
steps:
|
96
96
|
- step_4b
|
97
97
|
image_name: python_pandas.sif
|
98
|
-
zenodo_record_id:
|
98
|
+
zenodo_record_id: 15757317
|
99
99
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
100
100
|
script_cmd: python /dummy_step.py
|
101
101
|
env:
|
@@ -106,7 +106,7 @@ step_4b_r:
|
|
106
106
|
steps:
|
107
107
|
- step_4b
|
108
108
|
image_name: r-image.sif
|
109
|
-
zenodo_record_id:
|
109
|
+
zenodo_record_id: 15757317
|
110
110
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
111
111
|
script_cmd: Rscript /dummy_step.R
|
112
112
|
env:
|
@@ -117,7 +117,7 @@ step_1_python_pyspark:
|
|
117
117
|
steps:
|
118
118
|
- step_1
|
119
119
|
image_name: python_pyspark.sif
|
120
|
-
zenodo_record_id:
|
120
|
+
zenodo_record_id: 15757317
|
121
121
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
122
122
|
script_cmd: python3 /code/dummy_step.py
|
123
123
|
outputs:
|
@@ -127,7 +127,7 @@ step_2_python_pyspark:
|
|
127
127
|
steps:
|
128
128
|
- step_2
|
129
129
|
image_name: python_pyspark.sif
|
130
|
-
zenodo_record_id:
|
130
|
+
zenodo_record_id: 15757317
|
131
131
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
132
132
|
script_cmd: python3 /code/dummy_step.py
|
133
133
|
outputs:
|
@@ -137,7 +137,7 @@ step_3_python_pyspark:
|
|
137
137
|
steps:
|
138
138
|
- step_3
|
139
139
|
image_name: python_pyspark.sif
|
140
|
-
zenodo_record_id:
|
140
|
+
zenodo_record_id: 15757317
|
141
141
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
142
142
|
script_cmd: python3 /code/dummy_step.py
|
143
143
|
outputs:
|
@@ -147,7 +147,7 @@ step_4_python_pyspark:
|
|
147
147
|
steps:
|
148
148
|
- step_4
|
149
149
|
image_name: python_pyspark.sif
|
150
|
-
zenodo_record_id:
|
150
|
+
zenodo_record_id: 15757317
|
151
151
|
md5_checksum: c948577ab0607411dd4b640622d9ec3a
|
152
152
|
script_cmd: python3 /code/dummy_step.py
|
153
153
|
env:
|
@@ -158,7 +158,7 @@ step_1_r:
|
|
158
158
|
steps:
|
159
159
|
- step_1
|
160
160
|
image_name: r-image.sif
|
161
|
-
zenodo_record_id:
|
161
|
+
zenodo_record_id: 15757317
|
162
162
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
163
163
|
script_cmd: Rscript /dummy_step.R
|
164
164
|
outputs:
|
@@ -168,7 +168,7 @@ step_2_r:
|
|
168
168
|
steps:
|
169
169
|
- step_2
|
170
170
|
image_name: r-image.sif
|
171
|
-
zenodo_record_id:
|
171
|
+
zenodo_record_id: 15757317
|
172
172
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
173
173
|
script_cmd: Rscript /dummy_step.R
|
174
174
|
outputs:
|
@@ -178,7 +178,7 @@ step_3_r:
|
|
178
178
|
steps:
|
179
179
|
- step_3
|
180
180
|
image_name: r-image.sif
|
181
|
-
zenodo_record_id:
|
181
|
+
zenodo_record_id: 15757317
|
182
182
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
183
183
|
script_cmd: Rscript /dummy_step.R
|
184
184
|
outputs:
|
@@ -188,7 +188,7 @@ step_4_r:
|
|
188
188
|
steps:
|
189
189
|
- step_4
|
190
190
|
image_name: r-image.sif
|
191
|
-
zenodo_record_id:
|
191
|
+
zenodo_record_id: 15757317
|
192
192
|
md5_checksum: a4a03b836694a2b81a1bd2852736ccc5
|
193
193
|
script_cmd: Rscript /dummy_step.R
|
194
194
|
env:
|
@@ -201,7 +201,7 @@ step_1_and_step_2_combined_python_pandas:
|
|
201
201
|
- step_1
|
202
202
|
- step_2
|
203
203
|
image_name: python_pandas.sif
|
204
|
-
zenodo_record_id:
|
204
|
+
zenodo_record_id: 15757317
|
205
205
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
206
206
|
script_cmd: python /dummy_step.py
|
207
207
|
outputs:
|
@@ -211,7 +211,7 @@ step_1_and_step_2_parallel_python_pandas:
|
|
211
211
|
- step_1
|
212
212
|
- step_2
|
213
213
|
image_name: python_pandas.sif
|
214
|
-
zenodo_record_id:
|
214
|
+
zenodo_record_id: 15757317
|
215
215
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
216
216
|
script_cmd: python /dummy_step.py
|
217
217
|
env:
|
@@ -223,7 +223,7 @@ step_3_and_step_4_combined_python_pandas:
|
|
223
223
|
- step_3
|
224
224
|
- step_4
|
225
225
|
image_name: python_pandas.sif
|
226
|
-
zenodo_record_id:
|
226
|
+
zenodo_record_id: 15757317
|
227
227
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
228
228
|
script_cmd: python /dummy_step.py
|
229
229
|
outputs:
|
@@ -233,7 +233,7 @@ step_1a_and_step_1b_combined_python_pandas:
|
|
233
233
|
- step_1a
|
234
234
|
- step_1b
|
235
235
|
image_name: python_pandas.sif
|
236
|
-
zenodo_record_id:
|
236
|
+
zenodo_record_id: 15757317
|
237
237
|
md5_checksum: 9177b8e168fcc9cae91bf61265f2185c
|
238
238
|
script_cmd: python /dummy_step.py
|
239
239
|
outputs:
|
@@ -241,131 +241,157 @@ step_1a_and_step_1b_combined_python_pandas:
|
|
241
241
|
dummy_step_1_for_output_dir_example:
|
242
242
|
steps:
|
243
243
|
- step_1_for_output_dir_example
|
244
|
-
image_name:
|
244
|
+
image_name: dummy_step_1_for_output_dir_example.sif
|
245
245
|
script_cmd: python /dummy_step_1_for_output_dir_example.py
|
246
246
|
outputs:
|
247
247
|
step_1_main_output_directory: output_dir/
|
248
248
|
dummy_step_1_for_output_dir_example_default:
|
249
249
|
steps:
|
250
250
|
- step_1_for_output_dir_example
|
251
|
-
image_name:
|
251
|
+
image_name: dummy_step_1_for_output_dir_example.sif
|
252
252
|
script_cmd: python /dummy_step_1_for_output_dir_example.py
|
253
253
|
dummy_step_2_for_output_dir_example:
|
254
254
|
steps:
|
255
255
|
- step_2_for_output_dir_example
|
256
|
-
image_name:
|
256
|
+
image_name: dummy_step_2_for_output_dir_example.sif
|
257
257
|
script_cmd: python /dummy_step_2_for_output_dir_example.py
|
258
258
|
outputs:
|
259
259
|
step_2_main_output: result.parquet
|
260
260
|
default_removing_records:
|
261
261
|
steps:
|
262
262
|
- removing_records
|
263
|
-
image_name:
|
263
|
+
image_name: default_removing_records.sif
|
264
|
+
zenodo_record_id: 15757317
|
265
|
+
md5_checksum: 85dba6fd73c9f8f504fddb6d5c30f2de
|
264
266
|
script_cmd: python /default_removing_records.py
|
265
267
|
outputs:
|
266
268
|
dataset: dataset
|
267
269
|
default_clusters_to_links:
|
268
270
|
steps:
|
269
271
|
- clusters_to_links
|
270
|
-
image_name:
|
272
|
+
image_name: default_clusters_to_links.sif
|
273
|
+
zenodo_record_id: 15757317
|
274
|
+
md5_checksum: 0d00d1272bd8193f60727791097aa065
|
271
275
|
script_cmd: python /default_clusters_to_links.py
|
272
276
|
outputs:
|
273
277
|
known_links: result.parquet
|
274
278
|
default_determining_exclusions:
|
275
279
|
steps:
|
276
280
|
- determining_exclusions
|
277
|
-
image_name:
|
281
|
+
image_name: default_determining_exclusions.sif
|
282
|
+
zenodo_record_id: 15757317
|
283
|
+
md5_checksum: e61cb32ad45b79ca9a2c36db4e76ef7e
|
278
284
|
script_cmd: python /default_determining_exclusions.py
|
279
285
|
outputs:
|
280
286
|
ids_to_remove: result.parquet
|
281
287
|
default_updating_clusters:
|
282
288
|
steps:
|
283
289
|
- updating_clusters
|
284
|
-
image_name:
|
290
|
+
image_name: default_updating_clusters.sif
|
291
|
+
zenodo_record_id: 15757317
|
292
|
+
md5_checksum: cc6bd29e099c2523347fa04545aa35c9
|
285
293
|
script_cmd: python /default_updating_clusters.py
|
286
294
|
outputs:
|
287
295
|
clusters: clusters.parquet
|
288
|
-
dummy_canonicalizing_and_downstream_analysis
|
296
|
+
# NOTE: This was made from dummy_canonicalizing_and_downstream_analysis.py,
|
297
|
+
# if rebuilding change the name of that file to save_clusters.py
|
298
|
+
save_clusters:
|
289
299
|
steps:
|
290
300
|
- canonicalizing_and_downstream_analysis
|
291
|
-
image_name:
|
301
|
+
image_name: save_clusters.sif
|
302
|
+
zenodo_record_id: 15757317
|
303
|
+
md5_checksum: 384ab2be668cbadc45160a674f621022
|
292
304
|
script_cmd: python /dummy_canonicalizing_and_downstream_analysis.py
|
293
305
|
outputs:
|
294
306
|
analysis_output: result.parquet
|
295
|
-
dummy_pre-processing
|
307
|
+
# NOTE: This was made from dummy_pre-processing.py,
|
308
|
+
# if rebuilding change the name of that file to no_pre-processing.py
|
309
|
+
no_pre-processing:
|
296
310
|
steps:
|
297
311
|
- pre-processing
|
298
|
-
image_name:
|
312
|
+
image_name: no_pre-processing.sif
|
313
|
+
zenodo_record_id: 15757317
|
314
|
+
md5_checksum: 9a9c080cf145078152501cf96bf61f27
|
299
315
|
script_cmd: python /dummy_pre-processing.py
|
300
316
|
outputs:
|
301
317
|
dataset: dataset
|
302
318
|
default_schema_alignment:
|
303
319
|
steps:
|
304
320
|
- schema_alignment
|
305
|
-
image_name:
|
321
|
+
image_name: default_schema_alignment.sif
|
322
|
+
zenodo_record_id: 15757317
|
323
|
+
md5_checksum: 3166587f9cfec478b999a17074d628f7
|
306
324
|
script_cmd: python /default_schema_alignment.py
|
307
325
|
outputs:
|
308
326
|
records: result.parquet
|
309
327
|
splink_blocking_and_filtering:
|
310
328
|
steps:
|
311
329
|
- blocking_and_filtering
|
312
|
-
image_name:
|
330
|
+
image_name: splink_blocking_and_filtering.sif
|
331
|
+
zenodo_record_id: 15757317
|
332
|
+
md5_checksum: 8a365b90295ef6beaad2b7f80a03d768
|
313
333
|
script_cmd: python /splink_blocking_and_filtering.py
|
314
334
|
outputs:
|
315
335
|
blocks: blocks
|
316
336
|
splink_evaluating_pairs:
|
317
337
|
steps:
|
318
338
|
- evaluating_pairs
|
319
|
-
image_name:
|
339
|
+
image_name: splink_evaluating_pairs.sif
|
340
|
+
zenodo_record_id: 15757317
|
341
|
+
md5_checksum: b57f4bd16b7a3aa5099569078ea4c064
|
320
342
|
script_cmd: python /splink_evaluating_pairs.py
|
321
343
|
outputs:
|
322
344
|
links: result.parquet
|
323
345
|
splink_links_to_clusters:
|
324
346
|
steps:
|
325
347
|
- links_to_clusters
|
326
|
-
image_name:
|
348
|
+
image_name: splink_links_to_clusters.sif
|
349
|
+
zenodo_record_id: 15757317
|
350
|
+
md5_checksum: 645937f7bab9c2557b7aacafaf4e4765
|
327
351
|
script_cmd: python /splink_links_to_clusters.py
|
328
352
|
outputs:
|
329
353
|
clusters: result.parquet
|
330
354
|
fastLink_evaluating_pairs:
|
331
355
|
steps:
|
332
356
|
- evaluating_pairs
|
333
|
-
image_name:
|
357
|
+
image_name: fastLink_evaluating_pairs.sif
|
334
358
|
script_cmd: Rscript /fastLink_evaluating_pairs.R
|
335
359
|
outputs:
|
336
360
|
links: result.parquet
|
337
361
|
fastLink_links_to_clusters:
|
338
362
|
steps:
|
339
363
|
- links_to_clusters
|
340
|
-
image_name:
|
364
|
+
image_name: fastLink_links_to_clusters.sif
|
341
365
|
script_cmd: Rscript /fastLink_links_to_clusters.R
|
342
366
|
outputs:
|
343
367
|
clusters: result.parquet
|
344
368
|
exclude_clustered:
|
345
369
|
steps:
|
346
370
|
- determining_exclusions
|
347
|
-
image_name:
|
371
|
+
image_name: exclude_clustered.sif
|
348
372
|
script_cmd: python /exclude_clustered.py
|
349
373
|
outputs:
|
350
374
|
ids_to_remove: result.parquet
|
351
375
|
exclude_none:
|
352
376
|
steps:
|
353
377
|
- determining_exclusions
|
354
|
-
image_name:
|
378
|
+
image_name: exclude_none.sif
|
355
379
|
script_cmd: python /exclude_none.py
|
356
380
|
outputs:
|
357
381
|
ids_to_remove: result.parquet
|
358
382
|
update_clusters_by_connected_components:
|
359
383
|
steps:
|
360
384
|
- updating_clusters
|
361
|
-
image_name:
|
385
|
+
image_name: update_clusters_by_connected_components.sif
|
362
386
|
script_cmd: python /update_clusters_by_connected_components.py
|
363
387
|
outputs:
|
364
388
|
clusters: result.parquet
|
365
389
|
middle_name_to_initial:
|
366
390
|
steps:
|
367
391
|
- pre-processing
|
368
|
-
image_name:
|
392
|
+
image_name: middle_name_to_initial.sif
|
393
|
+
zenodo_record_id: 15757317
|
394
|
+
md5_checksum: 89db9c3318300cda9d538cde08c3c323
|
369
395
|
script_cmd: python /middle_name_to_initial.py
|
370
396
|
outputs:
|
371
397
|
dataset: dataset
|