splink 4.0.0.dev2__tar.gz → 4.0.0.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/PKG-INFO +38 -35
  2. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/README.md +25 -24
  3. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/pyproject.toml +53 -49
  4. splink-4.0.0.dev4/splink/__init__.py +60 -0
  5. splink-4.0.0.dev4/splink/accuracy.py +580 -0
  6. splink-4.0.0.dev4/splink/athena/linker.py +563 -0
  7. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/block_from_labels.py +11 -9
  8. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/blocking.py +240 -172
  9. splink-4.0.0.dev4/splink/blocking_analysis.py +11 -0
  10. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/blocking_rule_creator.py +12 -3
  11. splink-4.0.0.dev4/splink/blocking_rule_creator_utils.py +39 -0
  12. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/blocking_rule_library.py +21 -14
  13. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/cache_dict_with_logging.py +8 -2
  14. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/charts.py +70 -25
  15. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/cluster_studio.py +147 -92
  16. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/column_expression.py +51 -15
  17. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison.py +116 -133
  18. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_creator.py +16 -13
  19. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level.py +132 -138
  20. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_composition.py +8 -4
  21. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_creator.py +14 -10
  22. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_library.py +105 -48
  23. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_library.py +98 -45
  24. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_template_library.py +121 -87
  25. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_vector_distribution.py +1 -1
  26. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_vector_values.py +5 -7
  27. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/connected_components.py +69 -59
  28. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/cost_of_blocking_rules.py +6 -4
  29. splink-4.0.0.dev4/splink/database_api.py +364 -0
  30. splink-4.0.0.dev4/splink/databricks/enable_splink.py +71 -0
  31. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/datasets/__init__.py +7 -5
  32. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/dialects.py +102 -78
  33. splink-4.0.0.dev4/splink/duckdb/database_api.py +119 -0
  34. splink-4.0.0.dev4/splink/duckdb/dataframe.py +87 -0
  35. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/duckdb/duckdb_helpers/duckdb_helpers.py +1 -1
  36. splink-4.0.0.dev4/splink/edge_metrics.py +142 -0
  37. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/em_training_session.py +157 -185
  38. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/estimate_u.py +80 -39
  39. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/exceptions.py +41 -26
  40. splink-4.0.0.dev4/splink/expectation_maximisation.py +426 -0
  41. splink-4.0.0.dev4/splink/exploratory.py +4 -0
  42. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/accuracy_chart.json +1 -1
  43. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/blocking_rule_generated_comparisons.json +4 -15
  44. splink-4.0.0.dev4/splink/files/chart_defs/threshold_selection_tool.json +818 -0
  45. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/settings_jsonschema.json +6 -0
  46. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/find_brs_with_comparison_counts_below_threshold.py +44 -38
  47. splink-4.0.0.dev4/splink/find_matches_to_new_records.py +42 -0
  48. splink-4.0.0.dev4/splink/graph_metrics.py +314 -0
  49. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/input_column.py +38 -38
  50. splink-4.0.0.dev4/splink/internals/blocking_analysis.py +656 -0
  51. splink-4.0.0.dev4/splink/internals/completeness.py +126 -0
  52. {splink-4.0.0.dev2/splink → splink-4.0.0.dev4/splink/internals}/profile_data.py +63 -31
  53. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/labelling_tool.py +22 -13
  54. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/linker.py +699 -1353
  55. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/lower_id_on_lhs.py +7 -4
  56. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/m_from_labels.py +22 -11
  57. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/m_training.py +33 -16
  58. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/m_u_records_to_parameters.py +28 -18
  59. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/match_weights_histogram.py +15 -9
  60. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/misc.py +11 -23
  61. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/optimise_cost_of_brs.py +6 -1
  62. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/parse_sql.py +7 -3
  63. splink-4.0.0.dev4/splink/pipeline.py +121 -0
  64. splink-4.0.0.dev4/splink/postgres/database_api.py +191 -0
  65. splink-4.0.0.dev2/splink/postgres/linker.py → splink-4.0.0.dev4/splink/postgres/dataframe.py +7 -25
  66. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/predict.py +81 -29
  67. splink-4.0.0.dev4/splink/settings.py +688 -0
  68. splink-4.0.0.dev4/splink/settings_creator.py +142 -0
  69. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/settings_validation/log_invalid_columns.py +34 -23
  70. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/settings_validation/settings_column_cleaner.py +43 -19
  71. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/settings_validation/settings_validation_log_strings.py +78 -7
  72. splink-4.0.0.dev4/splink/settings_validation/valid_types.py +26 -0
  73. splink-4.0.0.dev4/splink/spark/database_api.py +326 -0
  74. splink-4.0.0.dev4/splink/spark/dataframe.py +71 -0
  75. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/spark/spark_helpers/custom_spark_dialect.py +2 -2
  76. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/splink_comparison_viewer.py +21 -19
  77. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/splink_dataframe.py +19 -10
  78. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/sql_transform.py +18 -9
  79. splink-4.0.0.dev4/splink/sqlite/database_api.py +103 -0
  80. splink-4.0.0.dev2/splink/sqlite/linker.py → splink-4.0.0.dev4/splink/sqlite/dataframe.py +5 -38
  81. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/term_frequencies.py +92 -76
  82. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/unique_id_concat.py +7 -1
  83. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/unlinkables.py +12 -8
  84. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/validate_jsonschema.py +4 -1
  85. splink-4.0.0.dev4/splink/vertically_concatenate.py +217 -0
  86. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/waterfall_chart.py +45 -27
  87. splink-4.0.0.dev2/splink/__init__.py +0 -1
  88. splink-4.0.0.dev2/splink/accuracy.py +0 -399
  89. splink-4.0.0.dev2/splink/analyse_blocking.py +0 -249
  90. splink-4.0.0.dev2/splink/athena/athena_linker.py +0 -12
  91. splink-4.0.0.dev2/splink/athena/linker.py +0 -561
  92. splink-4.0.0.dev2/splink/cluster_metrics.py +0 -139
  93. splink-4.0.0.dev2/splink/comparison_helpers_utils.py +0 -22
  94. splink-4.0.0.dev2/splink/comparison_library_utils.py +0 -138
  95. splink-4.0.0.dev2/splink/convert_v2_to_v3.py +0 -198
  96. splink-4.0.0.dev2/splink/database_api.py +0 -925
  97. splink-4.0.0.dev2/splink/databricks/enable_splink.py +0 -36
  98. splink-4.0.0.dev2/splink/dialect_base.py +0 -59
  99. splink-4.0.0.dev2/splink/duckdb/duckdb_linker.py +0 -12
  100. splink-4.0.0.dev2/splink/duckdb/linker.py +0 -160
  101. splink-4.0.0.dev2/splink/expectation_maximisation.py +0 -272
  102. splink-4.0.0.dev2/splink/exploratory_analysis.py +0 -0
  103. splink-4.0.0.dev2/splink/files/chart_defs/confusion_matrix.json +0 -364
  104. splink-4.0.0.dev2/splink/find_matches_to_new_records.py +0 -36
  105. splink-4.0.0.dev2/splink/format_sql.py +0 -7
  106. splink-4.0.0.dev2/splink/missingness.py +0 -97
  107. splink-4.0.0.dev2/splink/pipeline.py +0 -95
  108. splink-4.0.0.dev2/splink/postgres/postgres_linker.py +0 -12
  109. splink-4.0.0.dev2/splink/settings.py +0 -518
  110. splink-4.0.0.dev2/splink/settings_validation/valid_types.py +0 -196
  111. splink-4.0.0.dev2/splink/spark/linker.py +0 -123
  112. splink-4.0.0.dev2/splink/spark/spark_linker.py +0 -12
  113. splink-4.0.0.dev2/splink/splink_architecture.md +0 -79
  114. splink-4.0.0.dev2/splink/sqlite/sqlite_linker.py +0 -12
  115. splink-4.0.0.dev2/splink/vertically_concatenate.py +0 -81
  116. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/LICENSE +0 -0
  117. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/athena/athena_helpers/athena_transforms.py +0 -0
  118. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/athena/athena_helpers/athena_utils.py +0 -0
  119. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_helpers.py +0 -0
  120. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/comparison_level_sql.py +0 -0
  121. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/constants.py +0 -0
  122. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/default_from_jsonschema.py +0 -0
  123. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/DEPENDENCY_LICENSES.txt +0 -0
  124. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/comparator_score_chart.json +0 -0
  125. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/comparator_score_threshold_chart.json +0 -0
  126. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/completeness.json +0 -0
  127. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
  128. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/match_weight_histogram.json +0 -0
  129. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/match_weights_interactive_history.json +0 -0
  130. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/match_weights_waterfall.json +0 -0
  131. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/missingness.json +0 -0
  132. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/parameter_estimate_comparisons.json +0 -0
  133. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/phonetic_match_chart.json +0 -0
  134. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/precision_recall.json +0 -0
  135. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
  136. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/profile_data.json +0 -0
  137. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/roc.json +0 -0
  138. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/tf_adjustment_chart.json +0 -0
  139. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/chart_defs/unlinkables_chart_def.json +0 -0
  140. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/d3@7.8.5 +0 -0
  141. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/stdlib.js@5.8.3 +0 -0
  142. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/vega-embed@6.20.2 +0 -0
  143. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/vega-lite@5.2.0 +0 -0
  144. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/external_js/vega@5.21.0 +0 -0
  145. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/labelling_tool/slt.js +0 -0
  146. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/labelling_tool/template.j2 +0 -0
  147. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
  148. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
  149. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
  150. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_cluster_studio/cluster_template.j2 +0 -0
  151. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_cluster_studio/custom.css +0 -0
  152. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_comparison_viewer/custom.css +0 -0
  153. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_comparison_viewer/template.j2 +0 -0
  154. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/splink_vis_utils/splink_vis_utils.js +0 -0
  155. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/files/templates/single_chart_template.html +0 -0
  156. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/logging_messages.py +0 -0
  157. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/match_key_analysis.py +0 -0
  158. {splink-4.0.0.dev2 → splink-4.0.0.dev4}/splink/spark/jar_location.py +0 -0
@@ -1,34 +1,36 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: splink
3
- Version: 4.0.0.dev2
3
+ Version: 4.0.0.dev4
4
4
  Summary: Fast probabilistic data linkage at scale
5
5
  Home-page: https://github.com/moj-analytical-services/splink
6
6
  License: MIT
7
7
  Author: Robin Linacre
8
8
  Author-email: robinlinacre@hotmail.com
9
- Requires-Python: >=3.7.1,<4.0.0
9
+ Requires-Python: >=3.8.0,<4.0.0
10
10
  Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.8
13
13
  Classifier: Programming Language :: Python :: 3.9
14
14
  Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
16
17
  Provides-Extra: athena
17
18
  Provides-Extra: postgres
18
19
  Provides-Extra: pyspark
19
20
  Provides-Extra: spark
20
21
  Requires-Dist: Jinja2 (>=3.0.3)
21
22
  Requires-Dist: altair (>=5.0.1,<6.0.0)
22
- Requires-Dist: awswrangler (==2.18.0) ; (python_full_version >= "3.7.1" and python_version < "3.8") and (extra == "athena")
23
- Requires-Dist: awswrangler (>=3.0.0,<4.0.0) ; (python_version >= "3.8" and python_version < "4.0") and (extra == "athena")
24
- Requires-Dist: duckdb (>=0.8.0)
25
- Requires-Dist: jsonschema (>=3.2,<5.0)
26
- Requires-Dist: pandas (>1.3.0)
27
- Requires-Dist: phonetics (>=1.0.5,<2.0.0)
23
+ Requires-Dist: awswrangler (>=3.0.0,<4.0.0) ; (python_version >= "3.8") and (extra == "athena")
24
+ Requires-Dist: duckdb (>=0.9.2)
25
+ Requires-Dist: igraph (>=0.11.2) ; python_version >= "3.8"
26
+ Requires-Dist: jsonschema (>=3.2)
27
+ Requires-Dist: numpy (>=1.17.3) ; python_version < "3.12"
28
+ Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
29
+ Requires-Dist: pandas (>1.3.5)
30
+ Requires-Dist: phonetics (>=1.0.5)
28
31
  Requires-Dist: psycopg2-binary (>=2.8.0) ; extra == "postgres"
29
- Requires-Dist: pyspark (>=3.2.1,<4.0.0) ; extra == "pyspark" or extra == "spark"
30
- Requires-Dist: sqlalchemy (>=1.4.0,<2.0.0) ; extra == "postgres"
31
- Requires-Dist: sqlglot (>=13.0.0,<19.0.0)
32
+ Requires-Dist: pyspark (>=3.2.1) ; extra == "pyspark" or extra == "spark"
33
+ Requires-Dist: sqlglot (>=13.0.0)
32
34
  Project-URL: Repository, https://github.com/moj-analytical-services/splink
33
35
  Description-Content-Type: text/markdown
34
36
 
@@ -40,7 +42,8 @@ Description-Content-Type: text/markdown
40
42
  [![Downloads](https://static.pepy.tech/badge/splink/month)](https://pepy.tech/project/splink)
41
43
  [![Documentation](https://img.shields.io/badge/API-documentation-blue)](https://moj-analytical-services.github.io/splink/)
42
44
 
43
-
45
+ > [!IMPORTANT]
46
+ > Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html).
44
47
 
45
48
  # Fast, accurate and scalable probabilistic data linkage
46
49
 
@@ -54,7 +57,7 @@ Splink is a Python package for probabilistic record linkage (entity resolution)
54
57
  🎓 **Unsupervised Learning:** No training data is required for model training.
55
58
  📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.
56
59
 
57
- Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customizations to improve accuracy.
60
+ Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.
58
61
 
59
62
  ## What does Splink do?
60
63
 
@@ -72,7 +75,7 @@ and clusters these links to produce an estimated person ID:
72
75
 
73
76
  ## What data does Splink work best with?
74
77
 
75
- Before using Splink, input data should be standardized, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
78
+ Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
76
79
 
77
80
  Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.
78
81
 
@@ -104,39 +107,33 @@ or, if you prefer, you can instead install splink using conda:
104
107
  conda install -c conda-forge splink
105
108
  ```
106
109
 
107
- <details>
108
- <summary><h3>Additional installation methods</h3></summary>
110
+ ### Installing Splink for Specific Backends
109
111
 
110
- <br>
111
112
 
112
- ### Backend Specific Installs
113
- From Splink v3.9.7, packages required by specific splink backends can be optionally installed by adding the `[<backend>]` suffix to the end of your `pip install`.
113
+ For projects requiring specific backends, Splink offers optional installations for **Spark**, **Athena**, and **PostgreSQL**. These can be installed by appending the backend name in brackets to the pip install command:
114
+ ```sh
115
+ pip install 'splink[{backend}]'
116
+ ```
114
117
 
115
- **Note** that **SQLite** and **DuckDB** come packaged with Splink and do not need to be optionally installed.
118
+ Should you require a version of Splink without **DuckDB**, see our section on [DuckDBLess Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation).
116
119
 
117
- Backends supported by optional installs:
120
+ <details>
121
+ <summary><i>Click here for backend-specific installation commands</i></summary>
118
122
 
119
- **Spark**
123
+ #### Spark
120
124
  ```sh
121
125
  pip install 'splink[spark]'
122
126
  ```
123
127
 
124
- **Athena**
128
+ #### Athena
125
129
  ```sh
126
130
  pip install 'splink[athena]'
127
131
  ```
128
132
 
129
- **PostgreSQL**
133
+ #### PostgreSQL
130
134
  ```sh
131
135
  pip install 'splink[postgres]'
132
136
  ```
133
-
134
- <br>
135
-
136
- ### DuckDBLess Splink
137
- Should you require a more bare-bones version of Splink **without DuckDB**, please see the following area of the docs:
138
- > [DuckDBless Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation)
139
-
140
137
  </details>
141
138
 
142
139
  ## Quickstart
@@ -192,23 +189,29 @@ clusters.as_pandas_dataframe(limit=5)
192
189
 
193
190
  ## Charts Gallery
194
191
 
195
- You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](./charts/index.md).
192
+ You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).
196
193
 
197
194
  ## Support
198
195
 
199
196
  To find the best place to ask a question, report a bug or get general advice, please refer to our [Contributing Guide](./CONTRIBUTING.md).
200
197
 
198
+ ## Use Cases
199
+
200
+ To see how users are using Splink in the wild, check out the [Use Cases](https://moj-analytical-services.github.io/splink/#use-cases) section of the docs.
201
+
201
202
  ## Awards
202
203
 
203
- 🥇 Analysis in Government Awards 2020: Innovative Methods - [Winner](https://www.gov.uk/government/news/launch-of-the-analysis-in-government-awards)
204
+ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
204
205
 
205
- 🥇 MoJ DASD Awards 2020: Innovation and Impact - Winner
206
+ 🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
206
207
 
207
208
  🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
208
209
 
209
210
  🥈 Analysis in Government Awards 2022: Innovative Methods - [Runner up](https://twitter.com/gov_analysis/status/1616073633692274689?s=20&t=6TQyNLJRjnhsfJy28Zd6UQ)
210
211
 
211
- 🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
212
+ 🥇 Analysis in Government Awards 2020: Innovative Methods - [Winner](https://www.gov.uk/government/news/launch-of-the-analysis-in-government-awards)
213
+
214
+ 🥇 MoJ Data and Analytical Services Directorate (DASD) Awards 2020: Innovation and Impact - Winner
212
215
 
213
216
 
214
217
  ## Citation
@@ -6,7 +6,8 @@
6
6
  [![Downloads](https://static.pepy.tech/badge/splink/month)](https://pepy.tech/project/splink)
7
7
  [![Documentation](https://img.shields.io/badge/API-documentation-blue)](https://moj-analytical-services.github.io/splink/)
8
8
 
9
-
9
+ > [!IMPORTANT]
10
+ > Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html).
10
11
 
11
12
  # Fast, accurate and scalable probabilistic data linkage
12
13
 
@@ -20,7 +21,7 @@ Splink is a Python package for probabilistic record linkage (entity resolution)
20
21
  🎓 **Unsupervised Learning:** No training data is required for model training.
21
22
  📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.
22
23
 
23
- Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customizations to improve accuracy.
24
+ Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.
24
25
 
25
26
  ## What does Splink do?
26
27
 
@@ -38,7 +39,7 @@ and clusters these links to produce an estimated person ID:
38
39
 
39
40
  ## What data does Splink work best with?
40
41
 
41
- Before using Splink, input data should be standardized, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
42
+ Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
42
43
 
43
44
  Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.
44
45
 
@@ -70,39 +71,33 @@ or, if you prefer, you can instead install splink using conda:
70
71
  conda install -c conda-forge splink
71
72
  ```
72
73
 
73
- <details>
74
- <summary><h3>Additional installation methods</h3></summary>
74
+ ### Installing Splink for Specific Backends
75
75
 
76
- <br>
77
76
 
78
- ### Backend Specific Installs
79
- From Splink v3.9.7, packages required by specific splink backends can be optionally installed by adding the `[<backend>]` suffix to the end of your `pip install`.
77
+ For projects requiring specific backends, Splink offers optional installations for **Spark**, **Athena**, and **PostgreSQL**. These can be installed by appending the backend name in brackets to the pip install command:
78
+ ```sh
79
+ pip install 'splink[{backend}]'
80
+ ```
80
81
 
81
- **Note** that **SQLite** and **DuckDB** come packaged with Splink and do not need to be optionally installed.
82
+ Should you require a version of Splink without **DuckDB**, see our section on [DuckDBLess Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation).
82
83
 
83
- Backends supported by optional installs:
84
+ <details>
85
+ <summary><i>Click here for backend-specific installation commands</i></summary>
84
86
 
85
- **Spark**
87
+ #### Spark
86
88
  ```sh
87
89
  pip install 'splink[spark]'
88
90
  ```
89
91
 
90
- **Athena**
92
+ #### Athena
91
93
  ```sh
92
94
  pip install 'splink[athena]'
93
95
  ```
94
96
 
95
- **PostgreSQL**
97
+ #### PostgreSQL
96
98
  ```sh
97
99
  pip install 'splink[postgres]'
98
100
  ```
99
-
100
- <br>
101
-
102
- ### DuckDBLess Splink
103
- Should you require a more bare-bones version of Splink **without DuckDB**, please see the following area of the docs:
104
- > [DuckDBless Splink Installation](https://moj-analytical-services.github.io/splink/installations.html#duckdb-less-installation)
105
-
106
101
  </details>
107
102
 
108
103
  ## Quickstart
@@ -158,23 +153,29 @@ clusters.as_pandas_dataframe(limit=5)
158
153
 
159
154
  ## Charts Gallery
160
155
 
161
- You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](./charts/index.md).
156
+ You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).
162
157
 
163
158
  ## Support
164
159
 
165
160
  To find the best place to ask a question, report a bug or get general advice, please refer to our [Contributing Guide](./CONTRIBUTING.md).
166
161
 
162
+ ## Use Cases
163
+
164
+ To see how users are using Splink in the wild, check out the [Use Cases](https://moj-analytical-services.github.io/splink/#use-cases) section of the docs.
165
+
167
166
  ## Awards
168
167
 
169
- 🥇 Analysis in Government Awards 2020: Innovative Methods - [Winner](https://www.gov.uk/government/news/launch-of-the-analysis-in-government-awards)
168
+ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
170
169
 
171
- 🥇 MoJ DASD Awards 2020: Innovation and Impact - Winner
170
+ 🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
172
171
 
173
172
  🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
174
173
 
175
174
  🥈 Analysis in Government Awards 2022: Innovative Methods - [Runner up](https://twitter.com/gov_analysis/status/1616073633692274689?s=20&t=6TQyNLJRjnhsfJy28Zd6UQ)
176
175
 
177
- 🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)
176
+ 🥇 Analysis in Government Awards 2020: Innovative Methods - [Winner](https://www.gov.uk/government/news/launch-of-the-analysis-in-government-awards)
177
+
178
+ 🥇 MoJ Data and Analytical Services Directorate (DASD) Awards 2020: Innovation and Impact - Winner
178
179
 
179
180
 
180
181
  ## Citation
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "splink"
3
- version = "4.0.0.dev2"
3
+ version = "4.0.0.dev4"
4
4
  description = "Fast probabilistic data linkage at scale"
5
5
  authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
6
6
  license = "MIT"
@@ -9,70 +9,63 @@ repository = "https://github.com/moj-analytical-services/splink"
9
9
  readme = "README.md"
10
10
 
11
11
  [tool.poetry.dependencies]
12
- python = ">=3.7.1,<4.0.0"
13
- jsonschema = ">=3.2,<5.0"
12
+ python = ">=3.8.0,<4.0.0"
13
+ jsonschema = ">=3.2"
14
14
  # 1.3.5 is the last version supporting py 3.7.1
15
- pandas = ">1.3.0"
16
- duckdb = ">=0.8.0"
17
- sqlglot = ">=13.0.0, <19.0.0"
15
+ pandas = ">1.3.5"
16
+ duckdb = ">=0.9.2"
17
+ sqlglot = ">=13.0.0"
18
18
  altair = "^5.0.1"
19
19
  Jinja2 = ">=3.0.3"
20
- phonetics = "^1.0.5"
20
+ phonetics = ">=1.0.5"
21
+
22
+ # need to manually specify numpy versions suitable for CI
23
+ # 1.24.4 works with python 3.8, but not 3.12
24
+ numpy = [
25
+ # version is minimum valid with above listed pandas version
26
+ {version=">=1.17.3", python = "<3.12"},
27
+ {version=">=1.26.0", python = ">=3.12"},
28
+ ]
29
+
21
30
 
22
31
  # Optional installs
23
- pyspark = {version="^3.2.1", optional=true}
32
+ # python >=3.12 requires pyspark >=4.0.0 - currently unreleased
33
+ pyspark = {version=">=3.2.1", optional=true}
24
34
 
25
35
  awswrangler = [
26
- {version = "2.18.0,<3.0.0", python = ">=3.7.1,<3.8", optional=true},
27
- {version=">=3.0.0,<4.0.0", python = "^3.8", optional=true}
36
+ {version=">=3.0.0,<4.0.0", python = ">=3.8", optional=true}
28
37
  ]
29
38
 
30
- # sqlalchemy >= 2.0.0 not working well with older pandas
31
- sqlalchemy = {version=">=1.4.0,<2.0.0", optional=true}
32
39
  psycopg2-binary = {version=">=2.8.0", optional=true}
33
40
 
41
+ # for graph metrics
42
+ igraph = { version = ">=0.11.2", python = ">=3.8", optional=true }
43
+
34
44
  [tool.poetry.group.dev]
35
45
  [tool.poetry.group.dev.dependencies]
36
- tabulate = "0.8.9"
37
- pyspark = "^3.2.1"
38
- # sqlalchemy >= 2.0.0 not working well with older pandas
39
- sqlalchemy = ">=1.4.0,<2.0.0"
46
+ tabulate = ">=0.8.9"
47
+ pyspark = ">=3.2.1"
48
+ sqlalchemy = ">=1.4.0"
40
49
  # temporarily use binary version, to avoid issues with pg_config path
41
50
  psycopg2-binary = ">=2.8.0"
51
+ igraph = ">=0.11.2"
42
52
 
43
53
  [tool.poetry.group.linting]
44
54
  [tool.poetry.group.linting.dependencies]
45
- black = "22.6.0"
46
- ruff = "0.0.257"
55
+ ruff = "^0.4.2"
47
56
 
48
57
  [tool.poetry.group.testing]
49
58
  [tool.poetry.group.testing.dependencies]
50
59
  # pin to reduce dependencies
51
- pytest = "7.3"
60
+ pytest = ">=7.3"
52
61
  pyarrow = ">=7.0.0"
53
- networkx = "2.5.1"
54
- rapidfuzz = "^2.0.3"
55
-
56
- [tool.poetry.group.benchmarking]
57
- optional = true
58
- [tool.poetry.group.benchmarking.dependencies]
59
- pytest-benchmark = "^4"
60
- lzstring = "1.0.4"
62
+ networkx = ">=2.5.1"
63
+ rapidfuzz = ">=2.0.3"
61
64
 
62
65
  [tool.poetry.group.typechecking]
63
66
  optional = true
64
67
  [tool.poetry.group.typechecking.dependencies]
65
- mypy = "1.7.0"
66
-
67
- [tool.poetry.group.demos]
68
- [tool.poetry.group.demos.dependencies]
69
- importlib-resources = "5.4.0"
70
- jupyterlab = "3.6.1"
71
- pyarrow = ">=7.0.0"
72
- ipywidgets = "8.0.4"
73
- nbmake = "1.3.4"
74
- pytest = "^7.0"
75
- pyspark = "^3.2.1"
68
+ mypy = "1.9.0"
76
69
 
77
70
  [tool.poetry.extras]
78
71
  pyspark = ["pyspark"]
@@ -89,7 +82,7 @@ profile = "black"
89
82
 
90
83
  [tool.ruff]
91
84
  line-length = 88
92
- select = [
85
+ lint.select = [
93
86
  # Pyflakes
94
87
  "F",
95
88
  # Pycodestyle
@@ -102,7 +95,7 @@ select = [
102
95
  # flake8-print
103
96
  "T20"
104
97
  ]
105
- ignore = [
98
+ lint.ignore = [
106
99
  "B905", # `zip()` without an explicit `strict=` parameter
107
100
  "B006", # Do not use mutable data structures for argument defaults"
108
101
  ]
@@ -122,22 +115,33 @@ markers = [
122
115
  "spark_only",
123
116
  "sqlite",
124
117
  "sqlite_only",
118
+ "postgres",
119
+ "postgres_only",
125
120
  ]
126
121
 
127
122
  [tool.mypy]
128
123
  packages = "splink"
129
- # temporary exclusions
130
- exclude = [
131
- # modules getting substantial rewrites:
132
- '.*comparison_imports\.py$',
133
- '.*comparison.*library\.py',
134
- 'comparison_level_composition',
135
- # modules with large number of errors
136
- '.*linker\.py',
137
- ]
138
124
  # for now at least allow implicit optionals
139
125
  # to cut down on noise. Easy to fix.
140
126
  implicit_optional = true
141
127
  # for now, ignore missing imports
142
128
  # can remove later and install stubs, where existent
143
129
  ignore_missing_imports = true
130
+
131
+ # options for strict mode
132
+ # too much to handle at once, so opt-in a little at a time
133
+ # https://mypy.readthedocs.io/en/stable/existing_code.html#introduce-stricter-options
134
+ warn_unused_configs = true
135
+ warn_redundant_casts = true
136
+ warn_unused_ignores = true
137
+ strict_equality = true
138
+ # don't worry about warning: https://github.com/python/mypy/issues/16189
139
+ strict_concatenate = true
140
+ check_untyped_defs = true
141
+ disallow_subclassing_any = true
142
+ disallow_untyped_decorators = true
143
+ disallow_any_generics = true
144
+ # further strict checks to add in:
145
+ # disallow_untyped_calls = true
146
+ disallow_incomplete_defs = true
147
+ # disallow_untyped_defs = true
@@ -0,0 +1,60 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ # Explicitly declare exported names to avoid 'imported but unused' linting issues
4
+ __all__ = [
5
+ "block_on",
6
+ "splink_datasets",
7
+ "Linker",
8
+ "SettingsCreator",
9
+ "SQLiteAPI",
10
+ "SparkAPI",
11
+ "DuckDBAPI",
12
+ "PostgresAPI",
13
+ ]
14
+
15
+
16
+ from splink.blocking_rule_library import block_on
17
+ from splink.datasets import splink_datasets
18
+ from splink.linker import Linker
19
+ from splink.settings_creator import SettingsCreator
20
+ from splink.sqlite.database_api import SQLiteAPI
21
+
22
+ # The following is a workaround for the fact that dependencies of postgres, spark
23
+ # and duckdb may not be installed, but we don't want this to prevent import
24
+ # of the other backends.
25
+
26
+ # This enables auto-complete to be used to import the various DBAPIs
27
+ # and ensures that typing information is retained so e.g. the arguments autocomplete
28
+ # without importing them at runtime
29
+ if TYPE_CHECKING:
30
+ from splink.duckdb.database_api import DuckDBAPI
31
+ from splink.postgres.database_api import PostgresAPI
32
+ from splink.spark.database_api import SparkAPI
33
+
34
+
35
+ # Use __getattr__ to make the error appear at the point of use
36
+ def __getattr__(name):
37
+ try:
38
+ if name == "SparkAPI":
39
+ from splink.spark.database_api import SparkAPI
40
+
41
+ return SparkAPI
42
+ elif name == "DuckDBAPI":
43
+ from splink.duckdb.database_api import DuckDBAPI
44
+
45
+ return DuckDBAPI
46
+ elif name == "PostgresAPI":
47
+ from splink.postgres.database_api import PostgresAPI
48
+
49
+ return PostgresAPI
50
+ except ImportError as err:
51
+ if name in ["SparkAPI", "DuckDBAPI", "PostgresAPI"]:
52
+ raise ImportError(
53
+ f"{name} cannot be imported because its dependencies are not "
54
+ "installed. Please `pip install` the required package(s) as "
55
+ "specified in the optional dependencies in pyproject.toml"
56
+ ) from err
57
+ raise AttributeError(f"module 'splink' has no attribute '{name}'") from None
58
+
59
+
60
+ __version__ = "4.0.0.dev4"