splink 4.0.0.dev2__tar.gz → 4.0.0.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/PKG-INFO +38 -35
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/README.md +25 -24
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/pyproject.toml +53 -49
- splink-4.0.0.dev4/splink/__init__.py +60 -0
- splink-4.0.0.dev4/splink/accuracy.py +580 -0
- splink-4.0.0.dev4/splink/athena/linker.py +563 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/block_from_labels.py +11 -9
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/blocking.py +240 -172
- splink-4.0.0.dev4/splink/blocking_analysis.py +11 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/blocking_rule_creator.py +12 -3
- splink-4.0.0.dev4/splink/blocking_rule_creator_utils.py +39 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/blocking_rule_library.py +21 -14
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/cache_dict_with_logging.py +8 -2
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/charts.py +70 -25
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/cluster_studio.py +147 -92
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/column_expression.py +51 -15
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison.py +116 -133
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_creator.py +16 -13
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level.py +132 -138
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_composition.py +8 -4
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_creator.py +14 -10
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_library.py +105 -48
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_library.py +98 -45
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_template_library.py +121 -87
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_vector_distribution.py +1 -1
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_vector_values.py +5 -7
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/connected_components.py +69 -59
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/cost_of_blocking_rules.py +6 -4
- splink-4.0.0.dev4/splink/database_api.py +364 -0
- splink-4.0.0.dev4/splink/databricks/enable_splink.py +71 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/datasets/__init__.py +7 -5
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/dialects.py +102 -78
- splink-4.0.0.dev4/splink/duckdb/database_api.py +119 -0
- splink-4.0.0.dev4/splink/duckdb/dataframe.py +87 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/duckdb/duckdb_helpers/duckdb_helpers.py +1 -1
- splink-4.0.0.dev4/splink/edge_metrics.py +142 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/em_training_session.py +157 -185
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/estimate_u.py +80 -39
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/exceptions.py +41 -26
- splink-4.0.0.dev4/splink/expectation_maximisation.py +426 -0
- splink-4.0.0.dev4/splink/exploratory.py +4 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/accuracy_chart.json +1 -1
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/blocking_rule_generated_comparisons.json +4 -15
- splink-4.0.0.dev4/splink/files/chart_defs/threshold_selection_tool.json +818 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/settings_jsonschema.json +6 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/find_brs_with_comparison_counts_below_threshold.py +44 -38
- splink-4.0.0.dev4/splink/find_matches_to_new_records.py +42 -0
- splink-4.0.0.dev4/splink/graph_metrics.py +314 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/input_column.py +38 -38
- splink-4.0.0.dev4/splink/internals/blocking_analysis.py +656 -0
- splink-4.0.0.dev4/splink/internals/completeness.py +126 -0
- {splink-4.0.0.dev2/splink → splink-4.0.0.dev4/splink/internals}/profile_data.py +63 -31
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/labelling_tool.py +22 -13
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/linker.py +699 -1353
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/lower_id_on_lhs.py +7 -4
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/m_from_labels.py +22 -11
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/m_training.py +33 -16
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/m_u_records_to_parameters.py +28 -18
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/match_weights_histogram.py +15 -9
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/misc.py +11 -23
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/optimise_cost_of_brs.py +6 -1
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/parse_sql.py +7 -3
- splink-4.0.0.dev4/splink/pipeline.py +121 -0
- splink-4.0.0.dev4/splink/postgres/database_api.py +191 -0
- splink-4.0.0.dev2/splink/postgres/linker.py → splink-4.0.0.dev4/splink/postgres/dataframe.py +7 -25
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/predict.py +81 -29
- splink-4.0.0.dev4/splink/settings.py +688 -0
- splink-4.0.0.dev4/splink/settings_creator.py +142 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/settings_validation/log_invalid_columns.py +34 -23
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/settings_validation/settings_column_cleaner.py +43 -19
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/settings_validation/settings_validation_log_strings.py +78 -7
- splink-4.0.0.dev4/splink/settings_validation/valid_types.py +26 -0
- splink-4.0.0.dev4/splink/spark/database_api.py +326 -0
- splink-4.0.0.dev4/splink/spark/dataframe.py +71 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/spark/spark_helpers/custom_spark_dialect.py +2 -2
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/splink_comparison_viewer.py +21 -19
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/splink_dataframe.py +19 -10
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/sql_transform.py +18 -9
- splink-4.0.0.dev4/splink/sqlite/database_api.py +103 -0
- splink-4.0.0.dev2/splink/sqlite/linker.py → splink-4.0.0.dev4/splink/sqlite/dataframe.py +5 -38
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/term_frequencies.py +92 -76
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/unique_id_concat.py +7 -1
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/unlinkables.py +12 -8
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/validate_jsonschema.py +4 -1
- splink-4.0.0.dev4/splink/vertically_concatenate.py +217 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/waterfall_chart.py +45 -27
- splink-4.0.0.dev2/splink/__init__.py +0 -1
- splink-4.0.0.dev2/splink/accuracy.py +0 -399
- splink-4.0.0.dev2/splink/analyse_blocking.py +0 -249
- splink-4.0.0.dev2/splink/athena/athena_linker.py +0 -12
- splink-4.0.0.dev2/splink/athena/linker.py +0 -561
- splink-4.0.0.dev2/splink/cluster_metrics.py +0 -139
- splink-4.0.0.dev2/splink/comparison_helpers_utils.py +0 -22
- splink-4.0.0.dev2/splink/comparison_library_utils.py +0 -138
- splink-4.0.0.dev2/splink/convert_v2_to_v3.py +0 -198
- splink-4.0.0.dev2/splink/database_api.py +0 -925
- splink-4.0.0.dev2/splink/databricks/enable_splink.py +0 -36
- splink-4.0.0.dev2/splink/dialect_base.py +0 -59
- splink-4.0.0.dev2/splink/duckdb/duckdb_linker.py +0 -12
- splink-4.0.0.dev2/splink/duckdb/linker.py +0 -160
- splink-4.0.0.dev2/splink/expectation_maximisation.py +0 -272
- splink-4.0.0.dev2/splink/exploratory_analysis.py +0 -0
- splink-4.0.0.dev2/splink/files/chart_defs/confusion_matrix.json +0 -364
- splink-4.0.0.dev2/splink/find_matches_to_new_records.py +0 -36
- splink-4.0.0.dev2/splink/format_sql.py +0 -7
- splink-4.0.0.dev2/splink/missingness.py +0 -97
- splink-4.0.0.dev2/splink/pipeline.py +0 -95
- splink-4.0.0.dev2/splink/postgres/postgres_linker.py +0 -12
- splink-4.0.0.dev2/splink/settings.py +0 -518
- splink-4.0.0.dev2/splink/settings_validation/valid_types.py +0 -196
- splink-4.0.0.dev2/splink/spark/linker.py +0 -123
- splink-4.0.0.dev2/splink/spark/spark_linker.py +0 -12
- splink-4.0.0.dev2/splink/splink_architecture.md +0 -79
- splink-4.0.0.dev2/splink/sqlite/sqlite_linker.py +0 -12
- splink-4.0.0.dev2/splink/vertically_concatenate.py +0 -81
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/LICENSE +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/athena/athena_helpers/athena_transforms.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/athena/athena_helpers/athena_utils.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_helpers.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_sql.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/constants.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/default_from_jsonschema.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/DEPENDENCY_LICENSES.txt +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/comparator_score_chart.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/comparator_score_threshold_chart.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/completeness.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/match_weight_histogram.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/match_weights_interactive_history.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/match_weights_waterfall.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/missingness.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/parameter_estimate_comparisons.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/phonetic_match_chart.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/precision_recall.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/profile_data.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/roc.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/tf_adjustment_chart.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/unlinkables_chart_def.json +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/d3@7.8.5 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/stdlib.js@5.8.3 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/vega-embed@6.20.2 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/vega-lite@5.2.0 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/vega@5.21.0 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/labelling_tool/slt.js +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/labelling_tool/template.j2 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_cluster_studio/cluster_template.j2 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_cluster_studio/custom.css +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_comparison_viewer/custom.css +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_comparison_viewer/template.j2 +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_vis_utils/splink_vis_utils.js +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/templates/single_chart_template.html +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/logging_messages.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/match_key_analysis.py +0 -0
- {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/spark/jar_location.py +0 -0
|
@@ -1,34 +1,36 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: splink
|
|
3
|
-
Version: 4.0.0.dev2
|
|
3
|
+
Version: 4.0.0.dev4
|
|
4
4
|
Summary: Fast probabilistic data linkage at scale
|
|
5
5
|
Home-page: https://github.com/moj-analytical-services/splink
|
|
6
6
|
License: MIT
|
|
7
7
|
Author: Robin Linacre
|
|
8
8
|
Author-email: robinlinacre@hotmail.com
|
|
9
|
-
Requires-Python: >=3.
|
|
9
|
+
Requires-Python: >=3.8.0,<4.0.0
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.8
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
17
|
Provides-Extra: athena
|
|
17
18
|
Provides-Extra: postgres
|
|
18
19
|
Provides-Extra: pyspark
|
|
19
20
|
Provides-Extra: spark
|
|
20
21
|
Requires-Dist: Jinja2 (>=3.0.3)
|
|
21
22
|
Requires-Dist: altair (>=5.0.1,<6.0.0)
|
|
22
|
-
Requires-Dist: awswrangler (
|
|
23
|
-
Requires-Dist:
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: jsonschema (>=3.2
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist:
|
|
23
|
+
Requires-Dist: awswrangler (>=3.0.0,<4.0.0) ; (python_version >= "3.8") and (extra == "athena")
|
|
24
|
+
Requires-Dist: duckdb (>=0.9.2)
|
|
25
|
+
Requires-Dist: igraph (>=0.11.2) ; python_version >= "3.8"
|
|
26
|
+
Requires-Dist: jsonschema (>=3.2)
|
|
27
|
+
Requires-Dist: numpy (>=1.17.3) ; python_version < "3.12"
|
|
28
|
+
Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
|
|
29
|
+
Requires-Dist: pandas (>1.3.5)
|
|
30
|
+
Requires-Dist: phonetics (>=1.0.5)
|
|
28
31
|
Requires-Dist: psycopg2-binary (>=2.8.0) ; extra == "postgres"
|
|
29
|
-
Requires-Dist: pyspark (>=3.2.1
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist: sqlglot (>=13.0.0,<19.0.0)
|
|
32
|
+
Requires-Dist: pyspark (>=3.2.1) ; extra == "pyspark" or extra == "spark"
|
|
33
|
+
Requires-Dist: sqlglot (>=13.0.0)
|
|
32
34
|
Project-URL: Repository, https://github.com/moj-analytical-services/splink
|
|
33
35
|
Description-Content-Type: text/markdown
|
|
34
36
|
|
|
@@ -40,7 +42,8 @@ Description-Content-Type: text/markdown
|
|
|
40
42
|
[](https://pepy.tech/project/splink)
|
|
41
43
|
[](https://moj-analytical-services.github.io/splink/)
|
|
42
44
|
|
|
43
|
-
|
|
45
|
+
> [!IMPORTANT]
|
|
46
|
+
> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html).
|
|
44
47
|
|
|
45
48
|
# Fast, accurate and scalable probabilistic data linkage
|
|
46
49
|
|
|
@@ -54,7 +57,7 @@ Splink is a Python package for probabilistic record linkage (entity resolution)
|
|
|
54
57
|
🎓 **Unsupervised Learning:** No training data is required for model training.
|
|
55
58
|
📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.
|
|
56
59
|
|
|
57
|
-
Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various
|
|
60
|
+
Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.
|
|
58
61
|
|
|
59
62
|
## What does Splink do?
|
|
60
63
|
|
|
@@ -72,7 +75,7 @@ and clusters these links to produce an estimated person ID:
|
|
|
72
75
|
|
|
73
76
|
## What data does Splink work best with?
|
|
74
77
|
|
|
75
|
-
Before using Splink, input data should be
|
|
78
|
+
Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
|
|
76
79
|
|
|
77
80
|
Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.
|
|
78
81
|
|
|
@@ -104,39 +107,33 @@ or, if you prefer, you can instead install splink using conda:
|
|
|
104
107
|
conda install -c conda-forge splink
|
|
105
108
|
```
|
|
106
109
|
|
|
107
|
-
|
|
108
|
-
<summary><h3>Additional installation methods</h3></summary>
|
|
110
|
+
### Installing Splink for Specific Backends
|
|
109
111
|
|
|
110
|
-
<br>
|
|
111
112
|
|
|
112
|
-
|
|
113
|
-
|
|
113
|
+
For projects requiring specific backends, Splink offers optional installations for **Spark**, **Athena**, and **PostgreSQL**. These can be installed by appending the backend name in brackets to the pip install command:
|
|
114
|
+
```sh
|
|
115
|
+
pip install 'splink[{backend}]'
|
|
116
|
+
```
|
|
114
117
|
|
|
115
|
-
|
|
118
|
+
Should you require a version of Splink without **DuckDB**, see our section on [DuckDBLess Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation).
|
|
116
119
|
|
|
117
|
-
|
|
120
|
+
<details>
|
|
121
|
+
<summary><i>Click here for backend-specific installation commands</i></summary>
|
|
118
122
|
|
|
119
|
-
|
|
123
|
+
#### Spark
|
|
120
124
|
```sh
|
|
121
125
|
pip install 'splink[spark]'
|
|
122
126
|
```
|
|
123
127
|
|
|
124
|
-
|
|
128
|
+
#### Athena
|
|
125
129
|
```sh
|
|
126
130
|
pip install 'splink[athena]'
|
|
127
131
|
```
|
|
128
132
|
|
|
129
|
-
|
|
133
|
+
#### PostgreSQL
|
|
130
134
|
```sh
|
|
131
135
|
pip install 'splink[postgres]'
|
|
132
136
|
```
|
|
133
|
-
|
|
134
|
-
<br>
|
|
135
|
-
|
|
136
|
-
### DuckDBLess Splink
|
|
137
|
-
Should you require a more bare-bones version of Splink **without DuckDB**, please see the following area of the docs:
|
|
138
|
-
> [DuckDBless Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation)
|
|
139
|
-
|
|
140
137
|
</details>
|
|
141
138
|
|
|
142
139
|
## Quickstart
|
|
@@ -192,23 +189,29 @@ clusters.as_pandas_dataframe(limit=5)
|
|
|
192
189
|
|
|
193
190
|
## Charts Gallery
|
|
194
191
|
|
|
195
|
-
You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](
|
|
192
|
+
You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).
|
|
196
193
|
|
|
197
194
|
## Support
|
|
198
195
|
|
|
199
196
|
To find the best place to ask a question, report a bug or get general advice, please refer to our [Contributing Guide](./CONTRIBUTING.md).
|
|
200
197
|
|
|
198
|
+
## Use Cases
|
|
199
|
+
|
|
200
|
+
To see how users are using Splink in the wild, check out the [Use Cases](https://moj-analytical-services.github.io/splink/#use-cases) section of the docs.
|
|
201
|
+
|
|
201
202
|
## Awards
|
|
202
203
|
|
|
203
|
-
|
|
204
|
+
❓ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
|
|
204
205
|
|
|
205
|
-
|
|
206
|
+
🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
|
|
206
207
|
|
|
207
208
|
🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
|
|
208
209
|
|
|
209
210
|
🥈 Analysis in Government Awards 2022: Innovative Methods - [Runner up](https://twitter.com/gov_analysis/status/1616073633692274689?s=20&t=6TQyNLJRjnhsfJy28Zd6UQ)
|
|
210
211
|
|
|
211
|
-
|
|
212
|
+
🥇 Analysis in Government Awards 2020: Innovative Methods - [Winner](https://www.gov.uk/government/news/launch-of-the-analysis-in-government-awards)
|
|
213
|
+
|
|
214
|
+
🥇 MoJ Data and Analytical Services Directorate (DASD) Awards 2020: Innovation and Impact - Winner
|
|
212
215
|
|
|
213
216
|
|
|
214
217
|
## Citation
|
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
[](https://pepy.tech/project/splink)
|
|
7
7
|
[](https://moj-analytical-services.github.io/splink/)
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
> [!IMPORTANT]
|
|
10
|
+
> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html).
|
|
10
11
|
|
|
11
12
|
# Fast, accurate and scalable probabilistic data linkage
|
|
12
13
|
|
|
@@ -20,7 +21,7 @@ Splink is a Python package for probabilistic record linkage (entity resolution)
|
|
|
20
21
|
🎓 **Unsupervised Learning:** No training data is required for model training.
|
|
21
22
|
📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.
|
|
22
23
|
|
|
23
|
-
Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various
|
|
24
|
+
Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.
|
|
24
25
|
|
|
25
26
|
## What does Splink do?
|
|
26
27
|
|
|
@@ -38,7 +39,7 @@ and clusters these links to produce an estimated person ID:
|
|
|
38
39
|
|
|
39
40
|
## What data does Splink work best with?
|
|
40
41
|
|
|
41
|
-
Before using Splink, input data should be
|
|
42
|
+
Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
|
|
42
43
|
|
|
43
44
|
Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.
|
|
44
45
|
|
|
@@ -70,39 +71,33 @@ or, if you prefer, you can instead install splink using conda:
|
|
|
70
71
|
conda install -c conda-forge splink
|
|
71
72
|
```
|
|
72
73
|
|
|
73
|
-
|
|
74
|
-
<summary><h3>Additional installation methods</h3></summary>
|
|
74
|
+
### Installing Splink for Specific Backends
|
|
75
75
|
|
|
76
|
-
<br>
|
|
77
76
|
|
|
78
|
-
|
|
79
|
-
|
|
77
|
+
For projects requiring specific backends, Splink offers optional installations for **Spark**, **Athena**, and **PostgreSQL**. These can be installed by appending the backend name in brackets to the pip install command:
|
|
78
|
+
```sh
|
|
79
|
+
pip install 'splink[{backend}]'
|
|
80
|
+
```
|
|
80
81
|
|
|
81
|
-
|
|
82
|
+
Should you require a version of Splink without **DuckDB**, see our section on [DuckDBLess Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation).
|
|
82
83
|
|
|
83
|
-
|
|
84
|
+
<details>
|
|
85
|
+
<summary><i>Click here for backend-specific installation commands</i></summary>
|
|
84
86
|
|
|
85
|
-
|
|
87
|
+
#### Spark
|
|
86
88
|
```sh
|
|
87
89
|
pip install 'splink[spark]'
|
|
88
90
|
```
|
|
89
91
|
|
|
90
|
-
|
|
92
|
+
#### Athena
|
|
91
93
|
```sh
|
|
92
94
|
pip install 'splink[athena]'
|
|
93
95
|
```
|
|
94
96
|
|
|
95
|
-
|
|
97
|
+
#### PostgreSQL
|
|
96
98
|
```sh
|
|
97
99
|
pip install 'splink[postgres]'
|
|
98
100
|
```
|
|
99
|
-
|
|
100
|
-
<br>
|
|
101
|
-
|
|
102
|
-
### DuckDBLess Splink
|
|
103
|
-
Should you require a more bare-bones version of Splink **without DuckDB**, please see the following area of the docs:
|
|
104
|
-
> [DuckDBless Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation)
|
|
105
|
-
|
|
106
101
|
</details>
|
|
107
102
|
|
|
108
103
|
## Quickstart
|
|
@@ -158,23 +153,29 @@ clusters.as_pandas_dataframe(limit=5)
|
|
|
158
153
|
|
|
159
154
|
## Charts Gallery
|
|
160
155
|
|
|
161
|
-
You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](
|
|
156
|
+
You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).
|
|
162
157
|
|
|
163
158
|
## Support
|
|
164
159
|
|
|
165
160
|
To find the best place to ask a question, report a bug or get general advice, please refer to our [Contributing Guide](./CONTRIBUTING.md).
|
|
166
161
|
|
|
162
|
+
## Use Cases
|
|
163
|
+
|
|
164
|
+
To see how users are using Splink in the wild, check out the [Use Cases](https://moj-analytical-services.github.io/splink/#use-cases) section of the docs.
|
|
165
|
+
|
|
167
166
|
## Awards
|
|
168
167
|
|
|
169
|
-
|
|
168
|
+
❓ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
|
|
170
169
|
|
|
171
|
-
|
|
170
|
+
🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
|
|
172
171
|
|
|
173
172
|
🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
|
|
174
173
|
|
|
175
174
|
🥈 Analysis in Government Awards 2022: Innovative Methods - [Runner up](https://twitter.com/gov_analysis/status/1616073633692274689?s=20&t=6TQyNLJRjnhsfJy28Zd6UQ)
|
|
176
175
|
|
|
177
|
-
|
|
176
|
+
🥇 Analysis in Government Awards 2020: Innovative Methods - [Winner](https://www.gov.uk/government/news/launch-of-the-analysis-in-government-awards)
|
|
177
|
+
|
|
178
|
+
🥇 MoJ Data and Analytical Services Directorate (DASD) Awards 2020: Innovation and Impact - Winner
|
|
178
179
|
|
|
179
180
|
|
|
180
181
|
## Citation
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "splink"
|
|
3
|
-
version = "4.0.0.dev2"
|
|
3
|
+
version = "4.0.0.dev4"
|
|
4
4
|
description = "Fast probabilistic data linkage at scale"
|
|
5
5
|
authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
|
|
6
6
|
license = "MIT"
|
|
@@ -9,70 +9,63 @@ repository = "https://github.com/moj-analytical-services/splink"
|
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
|
|
11
11
|
[tool.poetry.dependencies]
|
|
12
|
-
python = ">=3.
|
|
13
|
-
jsonschema = ">=3.2
|
|
12
|
+
python = ">=3.8.0,<4.0.0"
|
|
13
|
+
jsonschema = ">=3.2"
|
|
14
14
|
# 1.3.5 is the last version supporting py 3.7.1
|
|
15
|
-
pandas = ">1.3.
|
|
16
|
-
duckdb = ">=0.
|
|
17
|
-
sqlglot = ">=13.0.0
|
|
15
|
+
pandas = ">1.3.5"
|
|
16
|
+
duckdb = ">=0.9.2"
|
|
17
|
+
sqlglot = ">=13.0.0"
|
|
18
18
|
altair = "^5.0.1"
|
|
19
19
|
Jinja2 = ">=3.0.3"
|
|
20
|
-
phonetics = "
|
|
20
|
+
phonetics = ">=1.0.5"
|
|
21
|
+
|
|
22
|
+
# need to manually specify numpy versions suitable for CI
|
|
23
|
+
# 1.24.4 works with python 3.8, but not 3.12
|
|
24
|
+
numpy = [
|
|
25
|
+
# version is minimum valid with above listed pandas version
|
|
26
|
+
{version=">=1.17.3", python = "<3.12"},
|
|
27
|
+
{version=">=1.26.0", python = ">=3.12"},
|
|
28
|
+
]
|
|
29
|
+
|
|
21
30
|
|
|
22
31
|
# Optional installs
|
|
23
|
-
|
|
32
|
+
# python >=3.12 requires pyspark >=4.0.0 - currently unreleased
|
|
33
|
+
pyspark = {version=">=3.2.1", optional=true}
|
|
24
34
|
|
|
25
35
|
awswrangler = [
|
|
26
|
-
{version
|
|
27
|
-
{version=">=3.0.0,<4.0.0", python = "^3.8", optional=true}
|
|
36
|
+
{version=">=3.0.0,<4.0.0", python = ">=3.8", optional=true}
|
|
28
37
|
]
|
|
29
38
|
|
|
30
|
-
# sqlalchemy >= 2.0.0 not working well with older pandas
|
|
31
|
-
sqlalchemy = {version=">=1.4.0,<2.0.0", optional=true}
|
|
32
39
|
psycopg2-binary = {version=">=2.8.0", optional=true}
|
|
33
40
|
|
|
41
|
+
# for graph metrics
|
|
42
|
+
igraph = { version = ">=0.11.2", python = ">=3.8", optional=true }
|
|
43
|
+
|
|
34
44
|
[tool.poetry.group.dev]
|
|
35
45
|
[tool.poetry.group.dev.dependencies]
|
|
36
|
-
tabulate = "0.8.9"
|
|
37
|
-
pyspark = "
|
|
38
|
-
|
|
39
|
-
sqlalchemy = ">=1.4.0,<2.0.0"
|
|
46
|
+
tabulate = ">=0.8.9"
|
|
47
|
+
pyspark = ">=3.2.1"
|
|
48
|
+
sqlalchemy = ">=1.4.0"
|
|
40
49
|
# temporarily use binary version, to avoid issues with pg_config path
|
|
41
50
|
psycopg2-binary = ">=2.8.0"
|
|
51
|
+
igraph = ">=0.11.2"
|
|
42
52
|
|
|
43
53
|
[tool.poetry.group.linting]
|
|
44
54
|
[tool.poetry.group.linting.dependencies]
|
|
45
|
-
|
|
46
|
-
ruff = "0.0.257"
|
|
55
|
+
ruff = "^0.4.2"
|
|
47
56
|
|
|
48
57
|
[tool.poetry.group.testing]
|
|
49
58
|
[tool.poetry.group.testing.dependencies]
|
|
50
59
|
# pin to reduce dependencies
|
|
51
|
-
pytest = "7.3"
|
|
60
|
+
pytest = ">=7.3"
|
|
52
61
|
pyarrow = ">=7.0.0"
|
|
53
|
-
networkx = "2.5.1"
|
|
54
|
-
rapidfuzz = "
|
|
55
|
-
|
|
56
|
-
[tool.poetry.group.benchmarking]
|
|
57
|
-
optional = true
|
|
58
|
-
[tool.poetry.group.benchmarking.dependencies]
|
|
59
|
-
pytest-benchmark = "^4"
|
|
60
|
-
lzstring = "1.0.4"
|
|
62
|
+
networkx = ">=2.5.1"
|
|
63
|
+
rapidfuzz = ">=2.0.3"
|
|
61
64
|
|
|
62
65
|
[tool.poetry.group.typechecking]
|
|
63
66
|
optional = true
|
|
64
67
|
[tool.poetry.group.typechecking.dependencies]
|
|
65
|
-
mypy = "1.
|
|
66
|
-
|
|
67
|
-
[tool.poetry.group.demos]
|
|
68
|
-
[tool.poetry.group.demos.dependencies]
|
|
69
|
-
importlib-resources = "5.4.0"
|
|
70
|
-
jupyterlab = "3.6.1"
|
|
71
|
-
pyarrow = ">=7.0.0"
|
|
72
|
-
ipywidgets = "8.0.4"
|
|
73
|
-
nbmake = "1.3.4"
|
|
74
|
-
pytest = "^7.0"
|
|
75
|
-
pyspark = "^3.2.1"
|
|
68
|
+
mypy = "1.9.0"
|
|
76
69
|
|
|
77
70
|
[tool.poetry.extras]
|
|
78
71
|
pyspark = ["pyspark"]
|
|
@@ -89,7 +82,7 @@ profile = "black"
|
|
|
89
82
|
|
|
90
83
|
[tool.ruff]
|
|
91
84
|
line-length = 88
|
|
92
|
-
select = [
|
|
85
|
+
lint.select = [
|
|
93
86
|
# Pyflakes
|
|
94
87
|
"F",
|
|
95
88
|
# Pycodestyle
|
|
@@ -102,7 +95,7 @@ select = [
|
|
|
102
95
|
# flake8-print
|
|
103
96
|
"T20"
|
|
104
97
|
]
|
|
105
|
-
ignore = [
|
|
98
|
+
lint.ignore = [
|
|
106
99
|
"B905", # `zip()` without an explicit `strict=` parameter
|
|
107
100
|
"B006", # Do not use mutable data structures for argument defaults"
|
|
108
101
|
]
|
|
@@ -122,22 +115,33 @@ markers = [
|
|
|
122
115
|
"spark_only",
|
|
123
116
|
"sqlite",
|
|
124
117
|
"sqlite_only",
|
|
118
|
+
"postgres",
|
|
119
|
+
"postgres_only",
|
|
125
120
|
]
|
|
126
121
|
|
|
127
122
|
[tool.mypy]
|
|
128
123
|
packages = "splink"
|
|
129
|
-
# temporary exclusions
|
|
130
|
-
exclude = [
|
|
131
|
-
# modules getting substantial rewrites:
|
|
132
|
-
'.*comparison_imports\.py$',
|
|
133
|
-
'.*comparison.*library\.py',
|
|
134
|
-
'comparison_level_composition',
|
|
135
|
-
# modules with large number of errors
|
|
136
|
-
'.*linker\.py',
|
|
137
|
-
]
|
|
138
124
|
# for now at least allow implicit optionals
|
|
139
125
|
# to cut down on noise. Easy to fix.
|
|
140
126
|
implicit_optional = true
|
|
141
127
|
# for now, ignore missing imports
|
|
142
128
|
# can remove later and install stubs, where existent
|
|
143
129
|
ignore_missing_imports = true
|
|
130
|
+
|
|
131
|
+
# options for strict mode
|
|
132
|
+
# too much to handle at once, so opt-in a little at a time
|
|
133
|
+
# https://mypy.readthedocs.io/en/stable/existing_code.html#introduce-stricter-options
|
|
134
|
+
warn_unused_configs = true
|
|
135
|
+
warn_redundant_casts = true
|
|
136
|
+
warn_unused_ignores = true
|
|
137
|
+
strict_equality = true
|
|
138
|
+
# don't worry about warning: https://github.com/python/mypy/issues/16189
|
|
139
|
+
strict_concatenate = true
|
|
140
|
+
check_untyped_defs = true
|
|
141
|
+
disallow_subclassing_any = true
|
|
142
|
+
disallow_untyped_decorators = true
|
|
143
|
+
disallow_any_generics = true
|
|
144
|
+
# further strict checks to add in:
|
|
145
|
+
# disallow_untyped_calls = true
|
|
146
|
+
disallow_incomplete_defs = true
|
|
147
|
+
# disallow_untyped_defs = true
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
# Explicitly declare exported names to avoid 'imported but unused' linting issues
|
|
4
|
+
__all__ = [
|
|
5
|
+
"block_on",
|
|
6
|
+
"splink_datasets",
|
|
7
|
+
"Linker",
|
|
8
|
+
"SettingsCreator",
|
|
9
|
+
"SQLiteAPI",
|
|
10
|
+
"SparkAPI",
|
|
11
|
+
"DuckDBAPI",
|
|
12
|
+
"PostgresAPI",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
from splink.blocking_rule_library import block_on
|
|
17
|
+
from splink.datasets import splink_datasets
|
|
18
|
+
from splink.linker import Linker
|
|
19
|
+
from splink.settings_creator import SettingsCreator
|
|
20
|
+
from splink.sqlite.database_api import SQLiteAPI
|
|
21
|
+
|
|
22
|
+
# The following is a workaround for the fact that dependencies of postgres, spark
|
|
23
|
+
# and duckdb may not be installed, but we don't want this to prevent import
|
|
24
|
+
# of the other backends.
|
|
25
|
+
|
|
26
|
+
# This enables auto-complete to be used to import the various DBAPIs
|
|
27
|
+
# and ensures that typing information is retained so e.g. the arguments autocomplete
|
|
28
|
+
# without importing them at runtime
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from splink.duckdb.database_api import DuckDBAPI
|
|
31
|
+
from splink.postgres.database_api import PostgresAPI
|
|
32
|
+
from splink.spark.database_api import SparkAPI
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Use a module-level __getattr__ so the import error appears at the point of use
|
|
36
|
+
def __getattr__(name):
|
|
37
|
+
try:
|
|
38
|
+
if name == "SparkAPI":
|
|
39
|
+
from splink.spark.database_api import SparkAPI
|
|
40
|
+
|
|
41
|
+
return SparkAPI
|
|
42
|
+
elif name == "DuckDBAPI":
|
|
43
|
+
from splink.duckdb.database_api import DuckDBAPI
|
|
44
|
+
|
|
45
|
+
return DuckDBAPI
|
|
46
|
+
elif name == "PostgresAPI":
|
|
47
|
+
from splink.postgres.database_api import PostgresAPI
|
|
48
|
+
|
|
49
|
+
return PostgresAPI
|
|
50
|
+
except ImportError as err:
|
|
51
|
+
if name in ["SparkAPI", "DuckDBAPI", "PostgresAPI"]:
|
|
52
|
+
raise ImportError(
|
|
53
|
+
f"{name} cannot be imported because its dependencies are not "
|
|
54
|
+
"installed. Please `pip install` the required package(s) as "
|
|
55
|
+
"specified in the optional dependencies in pyproject.toml"
|
|
56
|
+
) from err
|
|
57
|
+
raise AttributeError(f"module 'splink' has no attribute '{name}'") from None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
__version__ = "4.0.0.dev4"
|