splink 4.0.0.dev5__tar.gz → 4.0.0.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/PKG-INFO +48 -44
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/README.md +47 -42
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/pyproject.toml +1 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/__init__.py +1 -1
- splink-4.0.0.dev8/splink/backends/athena.py +3 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/blocking_analysis.py +2 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/comparison_library.py +10 -0
- splink-4.0.0.dev8/splink/exploratory.py +5 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/accuracy.py +24 -21
- splink-4.0.0.dev8/splink/internals/athena/database_api.py +266 -0
- splink-4.0.0.dev8/splink/internals/athena/dataframe.py +119 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/blocking.py +30 -31
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/blocking_analysis.py +104 -22
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/blocking_rule_library.py +29 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/cluster_studio.py +7 -7
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison.py +2 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_creator.py +13 -8
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_level_composition.py +0 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_level_library.py +1 -1
- splink-4.0.0.dev8/splink/internals/comparison_library.py +1120 -0
- splink-4.0.0.dev8/splink/internals/comparison_vector_values.py +96 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/completeness.py +0 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/connected_components.py +14 -48
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/database_api.py +13 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/databricks/enable_splink.py +16 -18
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/dialects.py +21 -10
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/duckdb/dataframe.py +3 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/edge_metrics.py +6 -6
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/em_training_session.py +36 -22
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/estimate_u.py +13 -10
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/find_brs_with_comparison_counts_below_threshold.py +20 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/find_matches_to_new_records.py +16 -5
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/input_column.py +5 -5
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/labelling_tool.py +2 -2
- splink-4.0.0.dev8/splink/internals/linker.py +766 -0
- splink-4.0.0.dev8/splink/internals/linker_components/clustering.py +278 -0
- splink-4.0.0.dev8/splink/internals/linker_components/evaluation.py +393 -0
- splink-4.0.0.dev8/splink/internals/linker_components/inference.py +614 -0
- splink-4.0.0.dev8/splink/internals/linker_components/misc.py +89 -0
- splink-4.0.0.dev8/splink/internals/linker_components/table_management.py +217 -0
- splink-4.0.0.dev8/splink/internals/linker_components/training.py +476 -0
- splink-4.0.0.dev8/splink/internals/linker_components/visualisations.py +412 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/m_from_labels.py +5 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/m_training.py +18 -7
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/match_weights_histogram.py +10 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/optimise_cost_of_brs.py +2 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/predict.py +1 -6
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/profile_data.py +1 -1
- splink-4.0.0.dev5/splink/internals/comparison_helpers.py → splink-4.0.0.dev8/splink/internals/similarity_analysis.py +39 -21
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/spark/database_api.py +9 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/splink_dataframe.py +8 -8
- splink-4.0.0.dev8/splink/internals/sqlite/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/term_frequencies.py +5 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/unlinkables.py +2 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/vertically_concatenate.py +2 -2
- splink-4.0.0.dev5/splink/comparison_template_library.py +0 -15
- splink-4.0.0.dev5/splink/exploratory.py +0 -4
- splink-4.0.0.dev5/splink/internals/athena/linker.py +0 -563
- splink-4.0.0.dev5/splink/internals/comparison_library.py +0 -646
- splink-4.0.0.dev5/splink/internals/comparison_template_library.py +0 -666
- splink-4.0.0.dev5/splink/internals/comparison_vector_values.py +0 -30
- splink-4.0.0.dev5/splink/internals/linker.py +0 -2835
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/LICENSE +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/backends/spark.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/backends/sqlite.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/comparison_level_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/datasets.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/athena/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/athena/athena_helpers/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/athena/athena_helpers/athena_transforms.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/athena/athena_helpers/athena_utils.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/block_from_labels.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/blocking_rule_creator.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/blocking_rule_creator_utils.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/cache_dict_with_logging.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/charts.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/column_expression.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_level.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_level_creator.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_level_sql.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/comparison_vector_distribution.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/constants.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/cost_of_blocking_rules.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/databricks/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/datasets/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/default_from_jsonschema.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/duckdb/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/duckdb/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/duckdb/duckdb_helpers/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/duckdb/duckdb_helpers/duckdb_helpers.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/exceptions.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/expectation_maximisation.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/DEPENDENCY_LICENSES.txt +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/accuracy_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/blocking_rule_generated_comparisons.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/comparator_score_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/comparator_score_threshold_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/completeness.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/match_weight_histogram.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/match_weights_interactive_history.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/match_weights_waterfall.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/missingness.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/parameter_estimate_comparisons.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/phonetic_match_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/precision_recall.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/profile_data.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/roc.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/tf_adjustment_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/threshold_selection_tool.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/unlinkables_chart_def.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/external_js/d3@7.8.5 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/external_js/stdlib.js@5.8.3 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/external_js/vega-embed@6.20.2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/external_js/vega-lite@5.2.0 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/external_js/vega@5.21.0 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/labelling_tool/slt.js +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/labelling_tool/template.j2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/settings_jsonschema.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/splink_cluster_studio/cluster_template.j2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/splink_cluster_studio/custom.css +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/splink_comparison_viewer/custom.css +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/splink_comparison_viewer/template.j2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/splink_vis_utils/splink_vis_utils.js +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/files/templates/single_chart_template.html +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/graph_metrics.py +0 -0
- {splink-4.0.0.dev5/splink/internals/postgres → splink-4.0.0.dev8/splink/internals/linker_components}/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/logging_messages.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/lower_id_on_lhs.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/m_u_records_to_parameters.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/match_key_analysis.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/misc.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/parse_sql.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/pipeline.py +0 -0
- {splink-4.0.0.dev5/splink/internals/settings_validation → splink-4.0.0.dev8/splink/internals/postgres}/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/postgres/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/postgres/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/settings.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/settings_creator.py +0 -0
- {splink-4.0.0.dev5/splink/internals/spark → splink-4.0.0.dev8/splink/internals/settings_validation}/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/settings_validation/log_invalid_columns.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/settings_validation/settings_column_cleaner.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/settings_validation/settings_validation_log_strings.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/settings_validation/valid_types.py +0 -0
- {splink-4.0.0.dev5/splink/internals/spark/spark_helpers → splink-4.0.0.dev8/splink/internals/spark}/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/spark/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/spark/jar_location.py +0 -0
- {splink-4.0.0.dev5/splink/internals/sqlite → splink-4.0.0.dev8/splink/internals/spark/spark_helpers}/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/spark/spark_helpers/custom_spark_dialect.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/splink_comparison_viewer.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/sql_transform.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/sqlite/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/sqlite/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/unique_id_concat.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/validate_jsonschema.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev8}/splink/internals/waterfall_chart.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: splink
|
|
3
|
-
Version: 4.0.0.dev5
|
|
3
|
+
Version: 4.0.0.dev8
|
|
4
4
|
Summary: Fast probabilistic data linkage at scale
|
|
5
5
|
Home-page: https://github.com/moj-analytical-services/splink
|
|
6
6
|
License: MIT
|
|
@@ -27,7 +27,6 @@ Requires-Dist: jsonschema (>=3.2)
|
|
|
27
27
|
Requires-Dist: numpy (>=1.17.3) ; python_version < "3.12"
|
|
28
28
|
Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
|
|
29
29
|
Requires-Dist: pandas (>1.3.5)
|
|
30
|
-
Requires-Dist: phonetics (>=1.0.5)
|
|
31
30
|
Requires-Dist: psycopg2-binary (>=2.8.0) ; extra == "postgres"
|
|
32
31
|
Requires-Dist: pyspark (>=3.2.1) ; extra == "pyspark" or extra == "spark"
|
|
33
32
|
Requires-Dist: sqlglot (>=13.0.0)
|
|
@@ -51,11 +50,11 @@ Splink is a Python package for probabilistic record linkage (entity resolution)
|
|
|
51
50
|
|
|
52
51
|
## Key Features
|
|
53
52
|
|
|
54
|
-
⚡ **Speed:** Capable of linking a million records on a laptop in around a minute
|
|
55
|
-
🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic
|
|
56
|
-
🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records
|
|
57
|
-
🎓 **Unsupervised Learning:** No training data is required for model training
|
|
58
|
-
📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems
|
|
53
|
+
⚡ **Speed:** Capable of linking a million records on a laptop in around a minute.<br>
|
|
54
|
+
🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic.<br>
|
|
55
|
+
🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records.<br>
|
|
56
|
+
🎓 **Unsupervised Learning:** No training data is required for model training.<br>
|
|
57
|
+
📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.<br>
|
|
59
58
|
|
|
60
59
|
Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.
|
|
61
60
|
|
|
@@ -75,19 +74,16 @@ and clusters these links to produce an estimated person ID:
|
|
|
75
74
|
|
|
76
75
|
## What data does Splink work best with?
|
|
77
76
|
|
|
78
|
-
Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
|
|
79
|
-
|
|
80
77
|
Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.
|
|
81
78
|
|
|
82
|
-
High correlation occurs when
|
|
79
|
+
High correlation occurs when one column is highly predictable from another - for instance, city can be predicted from postcode. Correlation is particularly problematic if **all** of your input columns are highly correlated.
|
|
83
80
|
|
|
84
81
|
Splink is not designed for linking a single column containing a 'bag of words'. For example, a table with a single 'company name' column, and no other details.
|
|
85
82
|
|
|
86
83
|
## Documentation
|
|
87
84
|
|
|
88
|
-
The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/)
|
|
85
|
+
The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/), including a [tutorial](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html) and [examples](https://moj-analytical-services.github.io/splink/demos/examples/examples_index.html) that can be run in the browser.
|
|
89
86
|
|
|
90
|
-
[](https://mybinder.org/v2/gh/moj-analytical-services/splink/binder_branch?labpath=docs%2Fdemos%2Ftutorials%2F00_Tutorial_Introduction.ipynb)
|
|
91
87
|
|
|
92
88
|
The specification of the Fellegi Sunter statistical model behind `splink` is similar to that used in the R [fastLink package](https://github.com/kosukeimai/fastLink). Accompanying the fastLink package is an [academic paper](http://imai.fas.harvard.edu/research/files/linkage.pdf) that describes this model. The [Splink documentation site](https://moj-analytical-services.github.io/splink/topic_guides/fellegi_sunter.html) and a [series of interactive articles](https://www.robinlinacre.com/probabilistic_linkage/) also explore the theory behind Splink.
|
|
93
89
|
|
|
@@ -143,43 +139,56 @@ The following code demonstrates how to estimate the parameters of a deduplicatio
|
|
|
143
139
|
For a more detailed tutorial, please see [here](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html).
|
|
144
140
|
|
|
145
141
|
```py
|
|
146
|
-
|
|
147
|
-
import splink.
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
142
|
+
import splink.comparison_library as cl
|
|
143
|
+
import splink.comparison_template_library as ctl
|
|
144
|
+
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
|
|
145
|
+
|
|
146
|
+
db_api = DuckDBAPI()
|
|
151
147
|
|
|
152
148
|
df = splink_datasets.fake_1000
|
|
153
149
|
|
|
154
|
-
settings =
|
|
155
|
-
|
|
156
|
-
|
|
150
|
+
settings = SettingsCreator(
|
|
151
|
+
link_type="dedupe_only",
|
|
152
|
+
comparisons=[
|
|
153
|
+
cl.JaroWinklerAtThresholds("first_name", [0.9, 0.7]),
|
|
154
|
+
cl.JaroAtThresholds("surname", [0.9, 0.7]),
|
|
155
|
+
ctl.DateComparison(
|
|
156
|
+
"dob",
|
|
157
|
+
input_is_string=True,
|
|
158
|
+
datetime_metrics=["year", "month"],
|
|
159
|
+
datetime_thresholds=[1, 1],
|
|
160
|
+
),
|
|
161
|
+
cl.ExactMatch("city").configure(term_frequency_adjustments=True),
|
|
162
|
+
ctl.EmailComparison("email"),
|
|
163
|
+
],
|
|
164
|
+
blocking_rules_to_generate_predictions=[
|
|
157
165
|
block_on("first_name"),
|
|
158
166
|
block_on("surname"),
|
|
159
|
-
]
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
167
|
+
]
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
linker = Linker(df, settings, db_api)
|
|
171
|
+
|
|
172
|
+
linker.training.estimate_probability_two_random_records_match(
|
|
173
|
+
[block_on("first_name", "surname")],
|
|
174
|
+
recall=0.7,
|
|
175
|
+
)
|
|
168
176
|
|
|
169
|
-
linker
|
|
170
|
-
linker.estimate_u_using_random_sampling(max_pairs=1e6)
|
|
177
|
+
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
|
|
171
178
|
|
|
172
|
-
|
|
179
|
+
linker.training.estimate_parameters_using_expectation_maximisation(
|
|
180
|
+
block_on("first_name", "surname")
|
|
181
|
+
)
|
|
173
182
|
|
|
174
|
-
linker.estimate_parameters_using_expectation_maximisation(
|
|
183
|
+
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))
|
|
175
184
|
|
|
176
|
-
|
|
177
|
-
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
|
|
185
|
+
pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)
|
|
178
186
|
|
|
179
|
-
|
|
187
|
+
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
|
|
188
|
+
pairwise_predictions, 0.95
|
|
189
|
+
)
|
|
180
190
|
|
|
181
|
-
|
|
182
|
-
clusters.as_pandas_dataframe(limit=5)
|
|
191
|
+
df_clusters = clusters.as_pandas_dataframe(limit=5)
|
|
183
192
|
```
|
|
184
193
|
|
|
185
194
|
## Videos
|
|
@@ -187,13 +196,10 @@ clusters.as_pandas_dataframe(limit=5)
|
|
|
187
196
|
- [An introductory presentation on Splink](https://www.youtube.com/watch?v=msz3T741KQI)
|
|
188
197
|
- [An introduction to the Splink Comparison Viewer dashboard](https://www.youtube.com/watch?v=DNvCMqjipis)
|
|
189
198
|
|
|
190
|
-
## Charts Gallery
|
|
191
|
-
|
|
192
|
-
You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).
|
|
193
199
|
|
|
194
200
|
## Support
|
|
195
201
|
|
|
196
|
-
To find the best place to ask a question, report a bug or get general advice, please refer to our [
|
|
202
|
+
To find the best place to ask a question, report a bug or get general advice, please refer to our [Guide](./CONTRIBUTING.md).
|
|
197
203
|
|
|
198
204
|
## Use Cases
|
|
199
205
|
|
|
@@ -201,8 +207,6 @@ To see how users are using Splink in the wild, check out the [Use Cases](https:/
|
|
|
201
207
|
|
|
202
208
|
## Awards
|
|
203
209
|
|
|
204
|
-
❓ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
|
|
205
|
-
|
|
206
210
|
🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
|
|
207
211
|
|
|
208
212
|
🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
|
|
@@ -15,11 +15,11 @@ Splink is a Python package for probabilistic record linkage (entity resolution)
|
|
|
15
15
|
|
|
16
16
|
## Key Features
|
|
17
17
|
|
|
18
|
-
⚡ **Speed:** Capable of linking a million records on a laptop in around a minute
|
|
19
|
-
🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic
|
|
20
|
-
🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records
|
|
21
|
-
🎓 **Unsupervised Learning:** No training data is required for model training
|
|
22
|
-
📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems
|
|
18
|
+
⚡ **Speed:** Capable of linking a million records on a laptop in around a minute.<br>
|
|
19
|
+
🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic.<br>
|
|
20
|
+
🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records.<br>
|
|
21
|
+
🎓 **Unsupervised Learning:** No training data is required for model training.<br>
|
|
22
|
+
📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.<br>
|
|
23
23
|
|
|
24
24
|
Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.
|
|
25
25
|
|
|
@@ -39,19 +39,16 @@ and clusters these links to produce an estimated person ID:
|
|
|
39
39
|
|
|
40
40
|
## What data does Splink work best with?
|
|
41
41
|
|
|
42
|
-
Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
|
|
43
|
-
|
|
44
42
|
Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.
|
|
45
43
|
|
|
46
|
-
High correlation occurs when
|
|
44
|
+
High correlation occurs when one column is highly predictable from another - for instance, city can be predicted from postcode. Correlation is particularly problematic if **all** of your input columns are highly correlated.
|
|
47
45
|
|
|
48
46
|
Splink is not designed for linking a single column containing a 'bag of words'. For example, a table with a single 'company name' column, and no other details.
|
|
49
47
|
|
|
50
48
|
## Documentation
|
|
51
49
|
|
|
52
|
-
The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/)
|
|
50
|
+
The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/), including a [tutorial](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html) and [examples](https://moj-analytical-services.github.io/splink/demos/examples/examples_index.html) that can be run in the browser.
|
|
53
51
|
|
|
54
|
-
[](https://mybinder.org/v2/gh/moj-analytical-services/splink/binder_branch?labpath=docs%2Fdemos%2Ftutorials%2F00_Tutorial_Introduction.ipynb)
|
|
55
52
|
|
|
56
53
|
The specification of the Fellegi Sunter statistical model behind `splink` is similar to that used in the R [fastLink package](https://github.com/kosukeimai/fastLink). Accompanying the fastLink package is an [academic paper](http://imai.fas.harvard.edu/research/files/linkage.pdf) that describes this model. The [Splink documentation site](https://moj-analytical-services.github.io/splink/topic_guides/fellegi_sunter.html) and a [series of interactive articles](https://www.robinlinacre.com/probabilistic_linkage/) also explore the theory behind Splink.
|
|
57
54
|
|
|
@@ -107,43 +104,56 @@ The following code demonstrates how to estimate the parameters of a deduplicatio
|
|
|
107
104
|
For a more detailed tutorial, please see [here](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html).
|
|
108
105
|
|
|
109
106
|
```py
|
|
110
|
-
|
|
111
|
-
import splink.
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
107
|
+
import splink.comparison_library as cl
|
|
108
|
+
import splink.comparison_template_library as ctl
|
|
109
|
+
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
|
|
110
|
+
|
|
111
|
+
db_api = DuckDBAPI()
|
|
115
112
|
|
|
116
113
|
df = splink_datasets.fake_1000
|
|
117
114
|
|
|
118
|
-
settings =
|
|
119
|
-
|
|
120
|
-
|
|
115
|
+
settings = SettingsCreator(
|
|
116
|
+
link_type="dedupe_only",
|
|
117
|
+
comparisons=[
|
|
118
|
+
cl.JaroWinklerAtThresholds("first_name", [0.9, 0.7]),
|
|
119
|
+
cl.JaroAtThresholds("surname", [0.9, 0.7]),
|
|
120
|
+
ctl.DateComparison(
|
|
121
|
+
"dob",
|
|
122
|
+
input_is_string=True,
|
|
123
|
+
datetime_metrics=["year", "month"],
|
|
124
|
+
datetime_thresholds=[1, 1],
|
|
125
|
+
),
|
|
126
|
+
cl.ExactMatch("city").configure(term_frequency_adjustments=True),
|
|
127
|
+
ctl.EmailComparison("email"),
|
|
128
|
+
],
|
|
129
|
+
blocking_rules_to_generate_predictions=[
|
|
121
130
|
block_on("first_name"),
|
|
122
131
|
block_on("surname"),
|
|
123
|
-
]
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
+
]
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
linker = Linker(df, settings, db_api)
|
|
136
|
+
|
|
137
|
+
linker.training.estimate_probability_two_random_records_match(
|
|
138
|
+
[block_on("first_name", "surname")],
|
|
139
|
+
recall=0.7,
|
|
140
|
+
)
|
|
132
141
|
|
|
133
|
-
linker
|
|
134
|
-
linker.estimate_u_using_random_sampling(max_pairs=1e6)
|
|
142
|
+
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)
|
|
135
143
|
|
|
136
|
-
|
|
144
|
+
linker.training.estimate_parameters_using_expectation_maximisation(
|
|
145
|
+
block_on("first_name", "surname")
|
|
146
|
+
)
|
|
137
147
|
|
|
138
|
-
linker.estimate_parameters_using_expectation_maximisation(
|
|
148
|
+
linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))
|
|
139
149
|
|
|
140
|
-
|
|
141
|
-
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
|
|
150
|
+
pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)
|
|
142
151
|
|
|
143
|
-
|
|
152
|
+
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
|
|
153
|
+
pairwise_predictions, 0.95
|
|
154
|
+
)
|
|
144
155
|
|
|
145
|
-
|
|
146
|
-
clusters.as_pandas_dataframe(limit=5)
|
|
156
|
+
df_clusters = clusters.as_pandas_dataframe(limit=5)
|
|
147
157
|
```
|
|
148
158
|
|
|
149
159
|
## Videos
|
|
@@ -151,13 +161,10 @@ clusters.as_pandas_dataframe(limit=5)
|
|
|
151
161
|
- [An introductory presentation on Splink](https://www.youtube.com/watch?v=msz3T741KQI)
|
|
152
162
|
- [An introduction to the Splink Comparison Viewer dashboard](https://www.youtube.com/watch?v=DNvCMqjipis)
|
|
153
163
|
|
|
154
|
-
## Charts Gallery
|
|
155
|
-
|
|
156
|
-
You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).
|
|
157
164
|
|
|
158
165
|
## Support
|
|
159
166
|
|
|
160
|
-
To find the best place to ask a question, report a bug or get general advice, please refer to our [
|
|
167
|
+
To find the best place to ask a question, report a bug or get general advice, please refer to our [Guide](./CONTRIBUTING.md).
|
|
161
168
|
|
|
162
169
|
## Use Cases
|
|
163
170
|
|
|
@@ -165,8 +172,6 @@ To see how users are using Splink in the wild, check out the [Use Cases](https:/
|
|
|
165
172
|
|
|
166
173
|
## Awards
|
|
167
174
|
|
|
168
|
-
❓ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
|
|
169
|
-
|
|
170
175
|
🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
|
|
171
176
|
|
|
172
177
|
🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "splink"
|
|
3
|
-
version = "4.0.0.dev5"
|
|
3
|
+
version = "4.0.0.dev8"
|
|
4
4
|
description = "Fast probabilistic data linkage at scale"
|
|
5
5
|
authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
|
|
6
6
|
license = "MIT"
|
|
@@ -17,7 +17,6 @@ duckdb = ">=0.9.2"
|
|
|
17
17
|
sqlglot = ">=13.0.0"
|
|
18
18
|
altair = "^5.0.1"
|
|
19
19
|
Jinja2 = ">=3.0.3"
|
|
20
|
-
phonetics = ">=1.0.5"
|
|
21
20
|
|
|
22
21
|
# need to manually specify numpy versions suitable for CI
|
|
23
22
|
# 1.24.4 works with python 3.8, but not 3.12
|
|
@@ -2,10 +2,12 @@ from .internals.blocking_analysis import (
|
|
|
2
2
|
count_comparisons_from_blocking_rule,
|
|
3
3
|
cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
|
|
4
4
|
cumulative_comparisons_to_be_scored_from_blocking_rules_data,
|
|
5
|
+
n_largest_blocks,
|
|
5
6
|
)
|
|
6
7
|
|
|
7
8
|
__all__ = [
|
|
8
9
|
"count_comparisons_from_blocking_rule",
|
|
9
10
|
"cumulative_comparisons_to_be_scored_from_blocking_rules_chart",
|
|
10
11
|
"cumulative_comparisons_to_be_scored_from_blocking_rules_data",
|
|
12
|
+
"n_largest_blocks",
|
|
11
13
|
]
|
|
@@ -4,13 +4,18 @@ from splink.internals.comparison_library import (
|
|
|
4
4
|
ArrayIntersectAtSizes,
|
|
5
5
|
CustomComparison,
|
|
6
6
|
DamerauLevenshteinAtThresholds,
|
|
7
|
+
DateOfBirthComparison,
|
|
7
8
|
DistanceFunctionAtThresholds,
|
|
8
9
|
DistanceInKMAtThresholds,
|
|
10
|
+
EmailComparison,
|
|
9
11
|
ExactMatch,
|
|
12
|
+
ForenameSurnameComparison,
|
|
10
13
|
JaccardAtThresholds,
|
|
11
14
|
JaroAtThresholds,
|
|
12
15
|
JaroWinklerAtThresholds,
|
|
13
16
|
LevenshteinAtThresholds,
|
|
17
|
+
NameComparison,
|
|
18
|
+
PostcodeComparison,
|
|
14
19
|
)
|
|
15
20
|
|
|
16
21
|
__all__ = [
|
|
@@ -26,4 +31,9 @@ __all__ = [
|
|
|
26
31
|
"AbsoluteDateDifferenceAtThresholds",
|
|
27
32
|
"ArrayIntersectAtSizes",
|
|
28
33
|
"DistanceInKMAtThresholds",
|
|
34
|
+
"DateOfBirthComparison",
|
|
35
|
+
"EmailComparison",
|
|
36
|
+
"ForenameSurnameComparison",
|
|
37
|
+
"NameComparison",
|
|
38
|
+
"PostcodeComparison",
|
|
29
39
|
]
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from copy import deepcopy
|
|
4
|
-
from typing import TYPE_CHECKING
|
|
4
|
+
from typing import TYPE_CHECKING, Optional
|
|
5
5
|
|
|
6
6
|
from splink.internals.block_from_labels import block_from_labels
|
|
7
7
|
from splink.internals.blocking import BlockingRule
|
|
@@ -307,8 +307,11 @@ def _select_found_by_blocking_rules(linker: "Linker") -> str:
|
|
|
307
307
|
|
|
308
308
|
|
|
309
309
|
def truth_space_table_from_labels_table(
|
|
310
|
-
linker
|
|
311
|
-
|
|
310
|
+
linker: Linker,
|
|
311
|
+
labels_tablename: str,
|
|
312
|
+
threshold_actual: float = 0.5,
|
|
313
|
+
match_weight_round_to_nearest: Optional[float] = None,
|
|
314
|
+
) -> SplinkDataFrame:
|
|
312
315
|
pipeline = CTEPipeline()
|
|
313
316
|
|
|
314
317
|
nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
|
|
@@ -323,7 +326,7 @@ def truth_space_table_from_labels_table(
|
|
|
323
326
|
)
|
|
324
327
|
pipeline.enqueue_list_of_sqls(sqls)
|
|
325
328
|
|
|
326
|
-
df_truth_space_table = linker.
|
|
329
|
+
df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
327
330
|
|
|
328
331
|
return df_truth_space_table
|
|
329
332
|
|
|
@@ -356,7 +359,7 @@ def truth_space_table_from_labels_column(
|
|
|
356
359
|
"""
|
|
357
360
|
|
|
358
361
|
pipeline.enqueue_sql(sql, "__splink__cartesian_product")
|
|
359
|
-
cartesian_count = linker.
|
|
362
|
+
cartesian_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
360
363
|
row_count_df = cartesian_count.as_record_dict()
|
|
361
364
|
cartesian_count.drop_table_from_database_and_remove_from_cache()
|
|
362
365
|
|
|
@@ -393,7 +396,7 @@ def truth_space_table_from_labels_column(
|
|
|
393
396
|
)
|
|
394
397
|
pipeline.enqueue_list_of_sqls(sqls)
|
|
395
398
|
|
|
396
|
-
df_truth_space_table = linker.
|
|
399
|
+
df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
397
400
|
|
|
398
401
|
return df_truth_space_table
|
|
399
402
|
|
|
@@ -439,12 +442,12 @@ def predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename):
|
|
|
439
442
|
|
|
440
443
|
|
|
441
444
|
def prediction_errors_from_labels_table(
|
|
442
|
-
linker,
|
|
443
|
-
labels_tablename,
|
|
444
|
-
include_false_positives=True,
|
|
445
|
-
include_false_negatives=True,
|
|
446
|
-
threshold=0.5,
|
|
447
|
-
):
|
|
445
|
+
linker: Linker,
|
|
446
|
+
labels_tablename: str,
|
|
447
|
+
include_false_positives: bool = True,
|
|
448
|
+
include_false_negatives: bool = True,
|
|
449
|
+
threshold: float = 0.5,
|
|
450
|
+
) -> SplinkDataFrame:
|
|
448
451
|
pipeline = CTEPipeline()
|
|
449
452
|
nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
|
|
450
453
|
pipeline = CTEPipeline([nodes_with_tf])
|
|
@@ -486,7 +489,7 @@ def prediction_errors_from_labels_table(
|
|
|
486
489
|
|
|
487
490
|
pipeline.enqueue_sql(sql, "__splink__labels_with_fp_fn_status")
|
|
488
491
|
|
|
489
|
-
return linker.
|
|
492
|
+
return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
490
493
|
|
|
491
494
|
|
|
492
495
|
def _predict_from_label_column_sql(linker, label_colname):
|
|
@@ -509,18 +512,18 @@ def _predict_from_label_column_sql(linker, label_colname):
|
|
|
509
512
|
settings._additional_column_names_to_retain.append(label_colname)
|
|
510
513
|
|
|
511
514
|
# Now we want to create predictions
|
|
512
|
-
df_predict = linker.predict()
|
|
515
|
+
df_predict = linker.inference.predict()
|
|
513
516
|
|
|
514
517
|
return df_predict
|
|
515
518
|
|
|
516
519
|
|
|
517
520
|
def prediction_errors_from_label_column(
|
|
518
|
-
linker,
|
|
519
|
-
label_colname,
|
|
520
|
-
include_false_positives=True,
|
|
521
|
-
include_false_negatives=True,
|
|
522
|
-
threshold=0.5,
|
|
523
|
-
):
|
|
521
|
+
linker: Linker,
|
|
522
|
+
label_colname: str,
|
|
523
|
+
include_false_positives: bool = True,
|
|
524
|
+
include_false_negatives: bool = True,
|
|
525
|
+
threshold: float = 0.5,
|
|
526
|
+
) -> SplinkDataFrame:
|
|
524
527
|
df_predict = _predict_from_label_column_sql(
|
|
525
528
|
linker,
|
|
526
529
|
label_colname,
|
|
@@ -577,6 +580,6 @@ def prediction_errors_from_label_column(
|
|
|
577
580
|
|
|
578
581
|
pipeline.enqueue_sql(sql, "__splink__predictions_from_label_column_fp_fn_only")
|
|
579
582
|
|
|
580
|
-
predictions = linker.
|
|
583
|
+
predictions = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
581
584
|
|
|
582
585
|
return predictions
|