splink 4.0.0.dev8__tar.gz → 4.0.0.dev9__tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (157)
  1. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/PKG-INFO +1 -1
  2. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/pyproject.toml +1 -1
  3. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/__init__.py +4 -11
  4. splink-4.0.0.dev9/splink/backends/duckdb.py +3 -0
  5. splink-4.0.0.dev9/splink/backends/postgres.py +3 -0
  6. splink-4.0.0.dev9/splink/backends/spark.py +4 -0
  7. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/accuracy.py +5 -5
  8. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_creator.py +39 -13
  9. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_composition.py +0 -1
  10. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_creator.py +56 -22
  11. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_library.py +5 -2
  12. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_library.py +0 -37
  13. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/database_api.py +3 -10
  14. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/dialects.py +0 -1
  15. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/em_training_session.py +17 -30
  16. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker.py +9 -9
  17. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/evaluation.py +57 -32
  18. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/inference.py +4 -4
  19. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/misc.py +3 -1
  20. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/table_management.py +1 -2
  21. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/training.py +3 -66
  22. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/database_api.py +12 -28
  23. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/splink_dataframe.py +2 -2
  24. splink-4.0.0.dev8/splink/backends/spark.py +0 -3
  25. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/LICENSE +0 -0
  26. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/README.md +0 -0
  27. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/backends/athena.py +0 -0
  28. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/backends/sqlite.py +0 -0
  29. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/blocking_analysis.py +0 -0
  30. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/comparison_level_library.py +0 -0
  31. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/comparison_library.py +0 -0
  32. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/datasets.py +0 -0
  33. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/exploratory.py +0 -0
  34. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/__init__.py +0 -0
  35. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/__init__.py +0 -0
  36. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/athena_helpers/__init__.py +0 -0
  37. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/athena_helpers/athena_transforms.py +0 -0
  38. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/athena_helpers/athena_utils.py +0 -0
  39. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/database_api.py +0 -0
  40. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/dataframe.py +0 -0
  41. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/block_from_labels.py +0 -0
  42. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking.py +0 -0
  43. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_analysis.py +0 -0
  44. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_rule_creator.py +0 -0
  45. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_rule_creator_utils.py +0 -0
  46. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_rule_library.py +0 -0
  47. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/cache_dict_with_logging.py +0 -0
  48. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/charts.py +0 -0
  49. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/cluster_studio.py +0 -0
  50. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/column_expression.py +0 -0
  51. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison.py +0 -0
  52. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level.py +0 -0
  53. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_sql.py +0 -0
  54. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_vector_distribution.py +0 -0
  55. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_vector_values.py +0 -0
  56. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/completeness.py +0 -0
  57. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/connected_components.py +0 -0
  58. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/constants.py +0 -0
  59. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/cost_of_blocking_rules.py +0 -0
  60. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/databricks/__init__.py +0 -0
  61. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/databricks/enable_splink.py +0 -0
  62. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/datasets/__init__.py +0 -0
  63. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/default_from_jsonschema.py +0 -0
  64. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/__init__.py +0 -0
  65. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/database_api.py +0 -0
  66. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/dataframe.py +0 -0
  67. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/duckdb_helpers/__init__.py +0 -0
  68. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/duckdb_helpers/duckdb_helpers.py +0 -0
  69. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/edge_metrics.py +0 -0
  70. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/estimate_u.py +0 -0
  71. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/exceptions.py +0 -0
  72. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/expectation_maximisation.py +0 -0
  73. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/DEPENDENCY_LICENSES.txt +0 -0
  74. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/accuracy_chart.json +0 -0
  75. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/blocking_rule_generated_comparisons.json +0 -0
  76. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/comparator_score_chart.json +0 -0
  77. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/comparator_score_threshold_chart.json +0 -0
  78. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/completeness.json +0 -0
  79. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
  80. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/match_weight_histogram.json +0 -0
  81. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/match_weights_interactive_history.json +0 -0
  82. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/match_weights_waterfall.json +0 -0
  83. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/missingness.json +0 -0
  84. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/parameter_estimate_comparisons.json +0 -0
  85. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/phonetic_match_chart.json +0 -0
  86. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/precision_recall.json +0 -0
  87. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
  88. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/profile_data.json +0 -0
  89. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/roc.json +0 -0
  90. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/tf_adjustment_chart.json +0 -0
  91. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/threshold_selection_tool.json +0 -0
  92. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/unlinkables_chart_def.json +0 -0
  93. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/d3@7.8.5 +0 -0
  94. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/stdlib.js@5.8.3 +0 -0
  95. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/vega-embed@6.20.2 +0 -0
  96. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/vega-lite@5.2.0 +0 -0
  97. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/vega@5.21.0 +0 -0
  98. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/labelling_tool/slt.js +0 -0
  99. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/labelling_tool/template.j2 +0 -0
  100. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/settings_jsonschema.json +0 -0
  101. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
  102. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
  103. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
  104. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_cluster_studio/cluster_template.j2 +0 -0
  105. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_cluster_studio/custom.css +0 -0
  106. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_comparison_viewer/custom.css +0 -0
  107. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_comparison_viewer/template.j2 +0 -0
  108. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_vis_utils/splink_vis_utils.js +0 -0
  109. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/templates/single_chart_template.html +0 -0
  110. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/find_brs_with_comparison_counts_below_threshold.py +0 -0
  111. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/find_matches_to_new_records.py +0 -0
  112. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/graph_metrics.py +0 -0
  113. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/input_column.py +0 -0
  114. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/labelling_tool.py +0 -0
  115. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/__init__.py +0 -0
  116. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/clustering.py +0 -0
  117. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/visualisations.py +0 -0
  118. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/logging_messages.py +0 -0
  119. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/lower_id_on_lhs.py +0 -0
  120. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/m_from_labels.py +0 -0
  121. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/m_training.py +0 -0
  122. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/m_u_records_to_parameters.py +0 -0
  123. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/match_key_analysis.py +0 -0
  124. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/match_weights_histogram.py +0 -0
  125. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/misc.py +0 -0
  126. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/optimise_cost_of_brs.py +0 -0
  127. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/parse_sql.py +0 -0
  128. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/pipeline.py +0 -0
  129. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/postgres/__init__.py +0 -0
  130. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/postgres/database_api.py +0 -0
  131. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/postgres/dataframe.py +0 -0
  132. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/predict.py +0 -0
  133. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/profile_data.py +0 -0
  134. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings.py +0 -0
  135. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_creator.py +0 -0
  136. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/__init__.py +0 -0
  137. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/log_invalid_columns.py +0 -0
  138. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/settings_column_cleaner.py +0 -0
  139. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/settings_validation_log_strings.py +0 -0
  140. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/valid_types.py +0 -0
  141. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/similarity_analysis.py +0 -0
  142. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/__init__.py +0 -0
  143. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/dataframe.py +0 -0
  144. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/jar_location.py +0 -0
  145. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/spark_helpers/__init__.py +0 -0
  146. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/spark_helpers/custom_spark_dialect.py +0 -0
  147. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/splink_comparison_viewer.py +0 -0
  148. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sql_transform.py +0 -0
  149. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sqlite/__init__.py +0 -0
  150. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sqlite/database_api.py +0 -0
  151. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sqlite/dataframe.py +0 -0
  152. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/term_frequencies.py +0 -0
  153. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/unique_id_concat.py +0 -0
  154. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/unlinkables.py +0 -0
  155. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/validate_jsonschema.py +0 -0
  156. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/vertically_concatenate.py +0 -0
  157. {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/waterfall_chart.py +0 -0
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: splink
-Version: 4.0.0.dev8
+Version: 4.0.0.dev9
 Summary: Fast probabilistic data linkage at scale
 Home-page: https://github.com/moj-analytical-services/splink
 License: MIT
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "4.0.0.dev8"
+version = "4.0.0.dev9"
 description = "Fast probabilistic data linkage at scale"
 authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
 license = "MIT"
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/__init__.py
@@ -6,8 +6,8 @@ from splink.internals.datasets import splink_datasets
 from splink.internals.linker import Linker
 from splink.internals.settings_creator import SettingsCreator
 
-# The following is a workaround for the fact that dependencies of postgres, spark
-# and duckdb may not be installed, but we don't want this to prevent import
+# The following is a workaround for the fact that dependencies of particular backends
+# may not be installed, but we don't want this to prevent import
 # of the other backends.
 
 # This enables auto-complete to be used to import the various DBAPIs
@@ -15,7 +15,6 @@ from splink.internals.settings_creator import SettingsCreator
 # without importing them at runtime
 if TYPE_CHECKING:
     from splink.internals.duckdb.database_api import DuckDBAPI
-    from splink.internals.postgres.database_api import PostgresAPI
     from splink.internals.spark.database_api import SparkAPI
 
 
@@ -30,12 +29,8 @@ def __getattr__(name):
             from splink.internals.duckdb.database_api import DuckDBAPI
 
             return DuckDBAPI
-        elif name == "PostgresAPI":
-            from splink.internals.postgres.database_api import PostgresAPI
-
-            return PostgresAPI
     except ImportError as err:
-        if name in ["SparkAPI", "DuckDBAPI", "PostgresAPI"]:
+        if name in ["SparkAPI", "DuckDBAPI"]:
             raise ImportError(
                 f"{name} cannot be imported because its dependencies are not "
                 "installed. Please `pip install` the required package(s) as "
@@ -44,7 +39,7 @@ def __getattr__(name):
     raise AttributeError(f"module 'splink' has no attribute '{name}'") from None
 
 
-__version__ = "4.0.0.dev8"
+__version__ = "4.0.0.dev9"
 
 
 __all__ = [
@@ -52,9 +47,7 @@ __all__ = [
     "ColumnExpression",
     "DuckDBAPI",
     "Linker",
-    "PostgresAPI",
     "SettingsCreator",
     "SparkAPI",
     "splink_datasets",
-    "SQLiteAPI",
 ]
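This `__getattr__` is a module-level attribute hook (PEP 562): the heavy backend imports only happen when the attribute is first accessed. `PostgresAPI` is dropped from it here because the new `splink.backends.postgres` module (below) takes over that job. A minimal, self-contained sketch of the pattern — module and class names here are illustrative, not splink's:

```py
# lazy_module.py - minimal sketch of the PEP 562 lazy-import pattern used above
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # visible to type checkers / auto-complete, never executed at runtime
    from heavy_backend import HeavyAPI  # hypothetical optional dependency


def __getattr__(name):
    # called only when normal module attribute lookup fails
    if name == "HeavyAPI":
        try:
            from heavy_backend import HeavyAPI  # deferred import
        except ImportError as err:
            raise ImportError(
                f"{name} cannot be imported because its dependencies "
                "are not installed"
            ) from err
        return HeavyAPI
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```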
splink-4.0.0.dev9/splink/backends/duckdb.py (new file)
@@ -0,0 +1,3 @@
+from splink.internals.duckdb.database_api import DuckDBAPI
+
+__all__ = ["DuckDBAPI"]

splink-4.0.0.dev9/splink/backends/postgres.py (new file)
@@ -0,0 +1,3 @@
+from splink.internals.postgres.database_api import PostgresAPI
+
+__all__ = ["PostgresAPI"]

splink-4.0.0.dev9/splink/backends/spark.py (new file)
@@ -0,0 +1,4 @@
+from splink.internals.spark.database_api import SparkAPI
+from splink.internals.spark.jar_location import similarity_jar_location
+
+__all__ = ["similarity_jar_location", "SparkAPI"]
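Together these three new modules give each backend a dedicated import path, so importing one backend never touches another backend's dependencies. Presumably the intended usage is:

```py
# only DuckDB's dependency tree is imported here
from splink.backends.duckdb import DuckDBAPI

db_api = DuckDBAPI()
```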
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/accuracy.py
@@ -446,7 +446,7 @@ def prediction_errors_from_labels_table(
     labels_tablename: str,
     include_false_positives: bool = True,
     include_false_negatives: bool = True,
-    threshold: float = 0.5,
+    threshold_match_probability: float = 0.5,
 ) -> SplinkDataFrame:
     pipeline = CTEPipeline()
     nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
@@ -457,13 +457,13 @@ def prediction_errors_from_labels_table(
     pipeline.enqueue_list_of_sqls(sqls)
 
     false_positives = f"""
-    (clerical_match_score < {threshold} and
-    match_probability > {threshold})
+    (clerical_match_score < {threshold_match_probability} and
+    match_probability > {threshold_match_probability})
     """
 
     false_negatives = f"""
-    (clerical_match_score > {threshold} and
-    match_probability < {threshold})
+    (clerical_match_score > {threshold_match_probability} and
+    match_probability < {threshold_match_probability})
    """
 
    where_conditions = []
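The `threshold` parameter is renamed to the more descriptive `threshold_match_probability`. As the two SQL fragments show, the threshold is applied symmetrically; re-stated as plain Python for clarity (an illustrative re-statement, not splink code):

```py
def classify_error(clerical_match_score: float,
                   match_probability: float,
                   threshold_match_probability: float = 0.5) -> str | None:
    # mirrors the false_positives / false_negatives SQL conditions above
    t = threshold_match_probability
    if clerical_match_score < t and match_probability > t:
        return "false_positive"   # model says match, label says non-match
    if clerical_match_score > t and match_probability < t:
        return "false_negative"   # model says non-match, label says match
    return None  # prediction and clerical label agree
```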
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_creator.py
@@ -7,7 +7,11 @@ from splink.internals.column_expression import ColumnExpression
 from splink.internals.exceptions import SplinkException
 
 from .comparison import Comparison
-from .comparison_level_creator import ComparisonLevelCreator
+from .comparison_level_creator import (
+    ComparisonLevelCreator,
+    UnsuppliedNoneOr,
+    unsupplied_option,
+)
 
 
 class ComparisonCreator(ABC):
@@ -65,7 +69,6 @@ class ComparisonCreator(ABC):
         # create levels - let them raise errors if there are issues
         self.create_comparison_levels()
 
-    # TODO: property?
     @abstractmethod
     def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
         pass
@@ -77,9 +80,11 @@ class ComparisonCreator(ABC):
 
         if self.term_frequency_adjustments:
             for cl in comparison_levels:
-                # TODO: Check that the column name a 'pure' column name and
-                # not a column expression with transforms applied
-                if cl.is_exact_match_level:
+                if (
+                    hasattr(cl, "col_expression")
+                    and cl.col_expression.is_pure_column_or_column_reference
+                    and cl.is_exact_match_level
+                ):
                     cl.term_frequency_adjustments = True
 
         if self.m_probabilities:
@@ -145,23 +150,37 @@ class ComparisonCreator(ABC):
     def configure(
         self,
         *,
-        term_frequency_adjustments: bool = False,
-        m_probabilities: List[float] = None,
-        u_probabilities: List[float] = None,
+        term_frequency_adjustments: UnsuppliedNoneOr[bool] = unsupplied_option,
+        m_probabilities: UnsuppliedNoneOr[List[float]] = unsupplied_option,
+        u_probabilities: UnsuppliedNoneOr[List[float]] = unsupplied_option,
    ) -> "ComparisonCreator":
        """
-        Configure the comparison creator with m and u probabilities. The first
+        Configure the comparison creator with options that are common to all
+        comparisons.
+
+        For m and u probabilities, the first
         element in the list corresponds to the first comparison level, usually
         an exact match level. Subsequent elements correspond to comparison
         levels in sequential order, through to the last element, which is usually
         the 'ELSE' level.
 
+        All options have default values set initially. Any call to `.configure()`
+        will set any options that are supplied. Any subsequent calls to `.configure()`
+        will not override these values with defaults; to override values you must
+        explicitly provide a value corresponding to the default.
+
+        Generally speaking, only a single call (at most) to `.configure()` should
+        be required.
+
         Args:
             term_frequency_adjustments (bool, optional): Whether term frequency
                 adjustments are switched on for this comparison. Only applied
-                to exact match levels. Default: False
+                to exact match levels.
+                Default corresponds to False.
             m_probabilities (list, optional): List of m probabilities
+                Default corresponds to None.
             u_probabilities (list, optional): List of u probabilities
+                Default corresponds to None.
 
         Example:
             ```py
@@ -175,9 +194,16 @@ class ComparisonCreator(ABC):
             ```
 
         """
-        self.term_frequency_adjustments = term_frequency_adjustments
-        self.m_probabilities = m_probabilities
-        self.u_probabilities = u_probabilities
+        configurables = {
+            "term_frequency_adjustments": term_frequency_adjustments,
+            "m_probabilities": m_probabilities,
+            "u_probabilities": u_probabilities,
+        }
+
+        for attribute_name, attribute_value in configurables.items():
+            if attribute_value is not unsupplied_option:
+                setattr(self, attribute_name, attribute_value)
+
         return self
 
     @property
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_composition.py
@@ -15,7 +15,6 @@ def _ensure_is_comparison_level_creator(
     if isinstance(cl, dict):
         from .comparison_level_library import CustomLevel
 
-        # TODO: proper dict => level method
         return CustomLevel(**cl)
     if isinstance(cl, ComparisonLevelCreator):
         return cl
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_creator.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from inspect import signature
-from typing import Any, final
+from typing import Any, TypeVar, Union, final
 
 from splink.internals.column_expression import ColumnExpression
 from splink.internals.dialects import SplinkDialect
@@ -10,6 +10,22 @@ from splink.internals.dialects import SplinkDialect
 from .comparison_level import ComparisonLevel
 
 
+class _UnsuppliedOption:
+    _instance: "_UnsuppliedOption" | None = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(_UnsuppliedOption, cls).__new__(cls)
+        return cls._instance
+
+
+unsupplied_option = _UnsuppliedOption()
+
+T = TypeVar("T")
+# type alias - either the specified type, _UnsuppliedOption, or None
+UnsuppliedNoneOr = Union[T, _UnsuppliedOption, None]
+
+
 class ComparisonLevelCreator(ABC):
     # off by default - only a small subset should have tf adjustments
     term_frequency_adjustments = False
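`_UnsuppliedOption` is a singleton sentinel: `None` cannot serve as the "argument not passed" marker because `None` is itself a meaningful value for several options (the `ExactMatchLevel` change further down passes `tf_adjustment_column=None` precisely to switch adjustments off). A standalone demonstration of the three-way distinction the sentinel makes possible:

```py
class _Unsupplied:
    pass


UNSUPPLIED = _Unsupplied()


def configure(tf_adjustment_column=UNSUPPLIED):
    if tf_adjustment_column is UNSUPPLIED:
        return "not supplied: leave the existing setting alone"
    if tf_adjustment_column is None:
        return "explicit None: clear the setting"
    return f"set to {tf_adjustment_column!r}"


print(configure())                             # not supplied
print(configure(tf_adjustment_column=None))    # explicit None
print(configure(tf_adjustment_column="city"))  # set to 'city'
```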
@@ -56,14 +72,14 @@ class ComparisonLevelCreator(ABC):
     def configure(
         self,
         *,
-        m_probability: float = None,
-        u_probability: float = None,
-        tf_adjustment_column: str = None,
-        tf_adjustment_weight: float = None,
-        tf_minimum_u_value: float = None,
-        is_null_level: bool = None,
-        label_for_charts: str = None,
-        disable_tf_exact_match_detection: bool = None,
+        m_probability: UnsuppliedNoneOr[float] = unsupplied_option,
+        u_probability: UnsuppliedNoneOr[float] = unsupplied_option,
+        tf_adjustment_column: UnsuppliedNoneOr[str] = unsupplied_option,
+        tf_adjustment_weight: UnsuppliedNoneOr[float] = unsupplied_option,
+        tf_minimum_u_value: UnsuppliedNoneOr[float] = unsupplied_option,
+        is_null_level: UnsuppliedNoneOr[bool] = unsupplied_option,
+        label_for_charts: UnsuppliedNoneOr[str] = unsupplied_option,
+        disable_tf_exact_match_detection: UnsuppliedNoneOr[bool] = unsupplied_option,
     ) -> "ComparisonLevelCreator":
         """
         Configure the comparison level with options which are common to all
@@ -71,29 +87,47 @@ class ComparisonLevelCreator(ABC):
         specification of a comparison level. These options are usually not
         needed, but are available for advanced users.
 
+        All options have default values set initially. Any call to `.configure()`
+        will set any options that are supplied. Any subsequent calls to `.configure()`
+        will not override these values with defaults; to override values you must
+        explicitly provide a value corresponding to the default.
+
+        Generally speaking, only a single call (at most) to `.configure()` should
+        be required.
 
         Args:
             m_probability (float, optional): The m probability for this
-                comparison level. Defaults to None, meaning it is not set.
+                comparison level.
+                Default is equivalent to None, in which case a default initial value
+                will be provided for this level.
             u_probability (float, optional): The u probability for this
-                comparison level. Defaults to None, meaning it is not set.
+                comparison level.
+                Default is equivalent to None, in which case a default initial value
+                will be provided for this level.
             tf_adjustment_column (str, optional): Make term frequency adjustments for
-                this comparison level using this input column. Defaults to None,
-                meaning term-frequency adjustments will not be applied for this level.
+                this comparison level using this input column.
+                Default is equivalent to None, meaning that term-frequency adjustments
+                will not be applied for this level.
             tf_adjustment_weight (float, optional): Make term frequency adjustments
-                for this comparison level using this weight. Defaults to None,
-                meaning term-frequency adjustments are fully-weighted if turned on.
+                for this comparison level using this weight.
+                Default is equivalent to None, meaning term-frequency adjustments are
+                fully-weighted if turned on.
             tf_minimum_u_value (float, optional): When term frequency adjustments are
                 turned on, where the term frequency adjustment implies a u value below
-                this value, use this minimum value instead. Defaults to None, meaning
-                no minimum value.
+                this value, use this minimum value instead.
+                Default is equivalent to None, meaning no minimum value.
             is_null_level (bool, optional): If true, m and u values will not be
                 estimated and instead the match weight will be zero for this column.
-                Defaults to None, equivalent to False.
+                Default is equivalent to False.
             label_for_charts (str, optional): If provided, a custom label that will
-                be used for this level in any charts. Defaults to None, in which case
-                a default label will be provided.
-
+                be used for this level in any charts.
+                Default is equivalent to None, in which case a default label will be
+                provided for this level.
+            disable_tf_exact_match_detection (bool, optional): If true, if term
+                frequency adjustments are set, the corresponding adjustment will be
+                made using the u-value for _this_ level, rather than the usual case
+                where it is the u-value of the exact match level in the same comparison.
+                Default is equivalent to False.
         Returns:
             ComparisonLevelCreator: The instance of the ComparisonLevelCreator class
             with the updated configuration.
@@ -101,7 +135,7 @@ class ComparisonLevelCreator(ABC):
         args = locals()
         del args["self"]
         for k, v in args.items():
-            if v is not None:
+            if v is not unsupplied_option:
                 setattr(self, k, v)
 
         return self
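A consequence of the `is not unsupplied_option` check is that repeated `configure()` calls merge rather than reset. A sketch of the expected behaviour (assuming an `ExactMatchLevel` imported from the comparison level library):

```py
level = ExactMatchLevel("first_name")
level.configure(m_probability=0.9)
level.configure(label_for_charts="Exact first name")
# m_probability is still 0.9: the second call did not supply it, so the
# `v is not unsupplied_option` guard above left the attribute untouched
```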
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_library.py
@@ -256,8 +256,11 @@ class ExactMatchLevel(ComparisonLevelCreator):
                 tf_adjustment_column=self.col_expression.raw_sql_expression,
                 tf_adjustment_weight=1.0,
             )
-
-        # TODO: how to 'turn off'?? configure doesn't currently allow
+        else:
+            self.configure(
+                tf_adjustment_column=None,
+                tf_adjustment_weight=None,
+            )
 
     def create_sql(self, sql_dialect: SplinkDialect) -> str:
         self.col_expression.sql_dialect = sql_dialect
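This `else` branch resolves the removed TODO: now that `configure()` treats an explicit `None` differently from an unsupplied argument, passing `None` is how the term frequency settings get cleared again. Assuming this code sits in the setter for `term_frequency_adjustments` (the enclosing definition is not shown in the hunk), toggling would behave roughly like:

```py
level = ExactMatchLevel("surname")
level.term_frequency_adjustments = True   # sets tf_adjustment_column and weight
level.term_frequency_adjustments = False  # the new else branch clears them
```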
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_library.py
@@ -615,7 +615,6 @@ class DateOfBirthComparison(ComparisonCreator):
             "year",
         ],
         datetime_format: str = None,
-        separate_1st_january: bool = False,
         invalid_dates_as_null: bool = True,
     ):
         """
@@ -644,10 +643,6 @@ class DateOfBirthComparison(ComparisonCreator):
                 Metrics for date differences. Defaults to ["month", "year", "year"].
             datetime_format (str, optional): The datetime format used to cast strings
                 to dates. Only used if input is a string.
-            separate_1st_january (bool, optional): Used for when date of birth is
-                sometimes recorded as 1st of Jan when only the year is known / If True,
-                a level is included for for a match on the year where at least one
-                side of the match is a date on the the 1st of January.
             invalid_dates_as_null (bool, optional): If True, treat invalid dates as null
                 as opposed to allowing e.g. an exact or levenshtein match where one side
                 or both are an invalid date. Only used if input is a string. Defaults
@@ -672,8 +667,6 @@ class DateOfBirthComparison(ComparisonCreator):
 
         self.datetime_format = datetime_format
 
-        self.separate_1st_january = separate_1st_january
-
         self.input_is_string = input_is_string
         self.invalid_dates_as_null = invalid_dates_as_null
 
@@ -693,36 +686,6 @@ class DateOfBirthComparison(ComparisonCreator):
             cll.NullLevel(null_col),
         ]
 
-        if self.input_is_string:
-            date_as_iso_string = self.datetime_parse_function(
-                self.datetime_format
-            ).cast_to_string()
-        else:
-            date_as_iso_string = self.col_expression.cast_to_string()
-
-        if self.separate_1st_january:
-            level = cll.And(
-                cll.Or(
-                    cll.LiteralMatchLevel(
-                        date_as_iso_string.substr(6, 5),
-                        literal_value="01-01",
-                        literal_datatype="string",
-                        side_of_comparison="left",
-                    ),
-                    cll.LiteralMatchLevel(
-                        date_as_iso_string.substr(6, 5),
-                        literal_value="01-01",
-                        literal_datatype="string",
-                        side_of_comparison="right",
-                    ),
-                ),
-                cll.ExactMatchLevel(date_as_iso_string.substr(0, 4)),
-            )
-
-            level.configure(label_for_charts="Exact match on year, 1st Jan only")
-
-            levels.append(level)
-
         levels.append(
             cll.ExactMatchLevel(self.col_expression).configure(
                 label_for_charts="Exact match on date of birth"
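With `separate_1st_january` gone, `DateOfBirthComparison` no longer offers a built-in level for dates of birth recorded as 1st January when only the year is known. The deleted block above doubles as a recipe: a user who still needs that behaviour could presumably rebuild the level from the comparison level library, combining `cll.And`/`cll.Or` with `cll.LiteralMatchLevel` on the "01-01" substring and `cll.ExactMatchLevel` on the year substring of the ISO date string, as in the removed code.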
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/database_api.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import hashlib
 import logging
-import random
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
@@ -47,8 +46,7 @@ class DatabaseAPI(ABC, Generic[TablishType]):
 
     def __init__(self) -> None:
         self._intermediate_table_cache: CacheDictWithLogging = CacheDictWithLogging()
-        # TODO: replace this:
-        self._cache_uid: str = str(random.choice(range(10000)))
+        self._cache_uid: str = ascii_uid(8)
 
     @final
     def _log_and_run_sql_execution(
@@ -80,7 +78,6 @@ class DatabaseAPI(ABC, Generic[TablishType]):
                 f"\n\nError was: {e}"
             ) from e
 
-    # TODO: rename this?
     @final
     def _sql_to_splink_dataframe(
         self, sql: str, templated_name: str, physical_name: str
@@ -140,9 +137,8 @@ class DatabaseAPI(ABC, Generic[TablishType]):
         use_cache: bool = True,
     ) -> SplinkDataFrame:
         # differences from _sql_to_splink_dataframe:
-        # this _calculates_ physical name, and
-        # handles debug_mode
-        # TODO: also maybe caching? but maybe that is even lower down
+        # this _calculates_ physical name, handles debug_mode,
+        # and checks cache before querying
         to_hash = (sql + self._cache_uid).encode("utf-8")
         hash = hashlib.sha256(to_hash).hexdigest()[:9]
         # Ensure hash is valid sql table name
@@ -342,9 +338,6 @@ class DatabaseAPI(ABC, Generic[TablishType]):
         input_tables = ensure_is_list(input_tables)
         return input_tables
 
-    # should probably also be responsible for cache
-    # TODO: stick this in a cache-api that lives on this
-
     def remove_splinkdataframe_from_cache(
         self, splink_dataframe: SplinkDataFrame
     ) -> None:
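The old cache uid was drawn from only 10,000 values (`str(random.choice(range(10000)))`), so two `DatabaseAPI` instances could plausibly collide and share cache keys. `ascii_uid(8)` widens that space considerably. The helper itself is not shown in this diff; a plausible sketch of what such a function does, for illustration only:

```py
import random
import string


def ascii_uid(length: int) -> str:
    # lowercase-alphanumeric uid: 36**8 (about 2.8e12) values at length 8,
    # versus the 10,000 possible values of the code being replaced
    alphabet = string.ascii_lowercase + string.digits
    return "".join(random.choices(alphabet, k=length))
```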
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/dialects.py
@@ -214,7 +214,6 @@ class DuckDBDialect(SplinkDialect):
     ) -> str:
         return f"regexp_extract({name}, '{pattern}', {capture_group})"
 
-    # TODO: roll out to other dialects, at least for now
     @property
     def infinity_expression(self):
         return "cast('infinity' as float8)"
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/em_training_session.py
@@ -11,7 +11,6 @@ from splink.internals.charts import (
     probability_two_random_records_match_iteration_chart,
 )
 from splink.internals.comparison import Comparison
-from splink.internals.comparison_level import ComparisonLevel
 from splink.internals.comparison_vector_values import (
     compute_comparison_vector_values_from_id_pairs_sqls,
 )
@@ -57,8 +56,6 @@ class EMTrainingSession:
         fix_u_probabilities: bool = False,
         fix_m_probabilities: bool = False,
         fix_probability_two_random_records_match: bool = False,
-        comparisons_to_deactivate: list[Comparison] = None,
-        comparison_levels_to_reverse_blocking_rule: list[ComparisonLevel] = None,
         estimate_without_term_frequencies: bool = False,
     ):
         logger.info("\n----- Starting EM training session -----\n")
@@ -77,20 +74,13 @@ class EMTrainingSession:
         self._blocking_rule_for_training = blocking_rule_for_training
         self.estimate_without_term_frequencies = estimate_without_term_frequencies
 
-        if comparison_levels_to_reverse_blocking_rule:
-            # TODO: atm this branch probably makes no sense. What would user pass?
-            # self._comparison_levels_to_reverse_blocking_rule = (
-            #     comparison_levels_to_reverse_blocking_rule
-            # )
-            raise ValueError("This path is broken for now.")
-        else:
-            self._comparison_levels_to_reverse_blocking_rule: list[
-                ComparisonAndLevelDict
-            ] = Settings._get_comparison_levels_corresponding_to_training_blocking_rule(  # noqa
-                blocking_rule_sql=blocking_rule_for_training.blocking_rule_sql,
-                sqlglot_dialect_name=self.db_api.sql_dialect.sqlglot_name,
-                comparisons=core_model_settings.comparisons,
-            )
+        self._comparison_levels_to_reverse_blocking_rule: list[
+            ComparisonAndLevelDict
+        ] = Settings._get_comparison_levels_corresponding_to_training_blocking_rule(  # noqa
+            blocking_rule_sql=blocking_rule_for_training.blocking_rule_sql,
+            sqlglot_dialect_name=self.db_api.sql_dialect.sqlglot_name,
+            comparisons=core_model_settings.comparisons,
+        )
 
         # batch together fixed probabilities rather than keep hold of the bools
         self.training_fixed_probabilities: set[str] = {
@@ -104,19 +94,16 @@ class EMTrainingSession:
         }
 
         # Remove comparison columns which are either 'used up' by the blocking rules
-        # or alternatively, if the user has manually provided a list to remove,
-        # use this instead
-        if not comparisons_to_deactivate:
-            comparisons_to_deactivate = []
-            br_cols = get_columns_used_from_sql(
-                blocking_rule_for_training.blocking_rule_sql,
-                self.db_api.sql_dialect.sqlglot_name,
-            )
-            for cc in core_model_settings.comparisons:
-                cc_cols = cc._input_columns_used_by_case_statement
-                cc_cols = [c.input_name for c in cc_cols]
-                if set(br_cols).intersection(cc_cols):
-                    comparisons_to_deactivate.append(cc)
+        comparisons_to_deactivate = []
+        br_cols = get_columns_used_from_sql(
+            blocking_rule_for_training.blocking_rule_sql,
+            self.db_api.sql_dialect.sqlglot_name,
+        )
+        for cc in core_model_settings.comparisons:
+            cc_cols = cc._input_columns_used_by_case_statement
+            cc_cols = [c.input_name for c in cc_cols]
+            if set(br_cols).intersection(cc_cols):
+                comparisons_to_deactivate.append(cc)
         cc_names_to_deactivate = [
             cc.output_column_name for cc in comparisons_to_deactivate
         ]
{splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker.py
@@ -74,7 +74,7 @@ class Linker:
         self,
         input_table_or_tables: str | list[str],
         settings: SettingsCreator | dict[str, Any] | Path | str,
-        database_api: DatabaseAPISubClass,
+        db_api: DatabaseAPISubClass,
         set_up_basic_logging: bool = True,
         input_table_aliases: str | list[str] | None = None,
         validate_settings: bool = True,
@@ -112,10 +112,12 @@ class Linker:
                 database) for link_only or link_and_dedupe. For some linkers, such as
                 the DuckDBLinker and the SparkLinker, it's also possible to pass in
                 dataframes (Pandas and Spark respectively) rather than strings.
-            settings_dict (dict | Path, optional): A Splink settings dictionary, or a
-                path to a json defining a settingss dictionary or pre-trained model.
-                If not provided when the object is created, can later be added using
-                `linker.load_settings()` or `linker.load_model()` Defaults to None.
+            settings_dict (dict | Path | str): A Splink settings dictionary,
+                or a path (either as a pathlib.Path object, or a string) to a json file
+                defining a settings dictionary or pre-trained model.
+            db_api (DatabaseAPI): A `DatabaseAPI` object, which manages interactions
+                with the database. You can import these for use from
+                `splink.backends.{your_backend}`
             set_up_basic_logging (bool, optional): If true, sets up basic logging
                 so that Splink sends messages at INFO level to stdout. Defaults to True.
             input_table_aliases (Union[str, list], optional): Labels assigned to
@@ -133,7 +135,7 @@ class Linker:
         splink_logger = logging.getLogger("splink")
         splink_logger.setLevel(logging.INFO)
 
-        self._db_api = database_api
+        self._db_api = db_api
 
         # TODO: temp hack for compat
         self._intermediate_table_cache: CacheDictWithLogging = (
@@ -154,9 +156,7 @@ class Linker:
         # or overwrite it with the db api dialect?
         # Maybe overwrite it here and incompatibilities have to be dealt with
         # by comparisons/ blocking rules etc??
-        self._settings_obj = settings_creator.get_settings(
-            database_api.sql_dialect.name
-        )
+        self._settings_obj = settings_creator.get_settings(db_api.sql_dialect.name)
 
         # TODO: Add test of what happens if the db_api is for a different backend
         # to the sql_dialect set in the settings dict
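The `database_api` parameter is now `db_api`, consistent with the attribute name used internally and with the docstring added above. Combined with the new backend modules, construction would presumably look like this (the table name and minimal settings dict are illustrative):

```py
from splink import Linker
from splink.backends.duckdb import DuckDBAPI

settings = {"link_type": "dedupe_only"}  # minimal illustrative settings dict
linker = Linker("my_input_table", settings, db_api=DuckDBAPI())
```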