splink 4.0.0.dev5__tar.gz → 4.0.0.dev6__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (154)
  1. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/PKG-INFO +1 -1
  2. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/pyproject.toml +1 -1
  3. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/__init__.py +1 -1
  4. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/accuracy.py +24 -21
  5. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cluster_studio.py +7 -7
  6. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/connected_components.py +6 -6
  7. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/edge_metrics.py +6 -6
  8. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/estimate_u.py +2 -1
  9. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/find_brs_with_comparison_counts_below_threshold.py +2 -2
  10. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/labelling_tool.py +2 -2
  11. splink-4.0.0.dev6/splink/internals/linker.py +759 -0
  12. splink-4.0.0.dev6/splink/internals/linker_components/clustering.py +284 -0
  13. splink-4.0.0.dev6/splink/internals/linker_components/evaluation.py +389 -0
  14. splink-4.0.0.dev6/splink/internals/linker_components/inference.py +513 -0
  15. splink-4.0.0.dev6/splink/internals/linker_components/misc.py +85 -0
  16. splink-4.0.0.dev6/splink/internals/linker_components/table_management.py +206 -0
  17. splink-4.0.0.dev6/splink/internals/linker_components/training.py +444 -0
  18. splink-4.0.0.dev6/splink/internals/linker_components/visualisations.py +360 -0
  19. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/m_from_labels.py +5 -2
  20. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/m_training.py +5 -2
  21. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/match_weights_histogram.py +10 -3
  22. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/optimise_cost_of_brs.py +2 -3
  23. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/term_frequencies.py +2 -2
  24. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/unlinkables.py +1 -1
  25. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/vertically_concatenate.py +2 -2
  26. splink-4.0.0.dev5/splink/internals/linker.py +0 -2835
  27. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/LICENSE +0 -0
  28. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/README.md +0 -0
  29. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/backends/spark.py +0 -0
  30. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/backends/sqlite.py +0 -0
  31. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/blocking_analysis.py +0 -0
  32. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/comparison_level_library.py +0 -0
  33. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/comparison_library.py +0 -0
  34. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/comparison_template_library.py +0 -0
  35. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/datasets.py +0 -0
  36. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/exploratory.py +0 -0
  37. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/__init__.py +0 -0
  38. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/__init__.py +0 -0
  39. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/athena_helpers/__init__.py +0 -0
  40. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/athena_helpers/athena_transforms.py +0 -0
  41. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/athena_helpers/athena_utils.py +0 -0
  42. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/linker.py +0 -0
  43. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/block_from_labels.py +0 -0
  44. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking.py +0 -0
  45. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_analysis.py +0 -0
  46. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_rule_creator.py +0 -0
  47. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_rule_creator_utils.py +0 -0
  48. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_rule_library.py +0 -0
  49. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cache_dict_with_logging.py +0 -0
  50. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/charts.py +0 -0
  51. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/column_expression.py +0 -0
  52. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison.py +0 -0
  53. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_creator.py +0 -0
  54. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_helpers.py +0 -0
  55. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level.py +0 -0
  56. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_composition.py +0 -0
  57. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_creator.py +0 -0
  58. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_library.py +0 -0
  59. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_sql.py +0 -0
  60. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_library.py +0 -0
  61. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_template_library.py +0 -0
  62. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_vector_distribution.py +0 -0
  63. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_vector_values.py +0 -0
  64. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/completeness.py +0 -0
  65. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/constants.py +0 -0
  66. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cost_of_blocking_rules.py +0 -0
  67. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/database_api.py +0 -0
  68. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/databricks/__init__.py +0 -0
  69. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/databricks/enable_splink.py +0 -0
  70. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/datasets/__init__.py +0 -0
  71. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/default_from_jsonschema.py +0 -0
  72. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/dialects.py +0 -0
  73. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/__init__.py +0 -0
  74. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/database_api.py +0 -0
  75. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/dataframe.py +0 -0
  76. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/duckdb_helpers/__init__.py +0 -0
  77. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/duckdb_helpers/duckdb_helpers.py +0 -0
  78. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/em_training_session.py +0 -0
  79. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/exceptions.py +0 -0
  80. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/expectation_maximisation.py +0 -0
  81. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/DEPENDENCY_LICENSES.txt +0 -0
  82. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/accuracy_chart.json +0 -0
  83. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/blocking_rule_generated_comparisons.json +0 -0
  84. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/comparator_score_chart.json +0 -0
  85. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/comparator_score_threshold_chart.json +0 -0
  86. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/completeness.json +0 -0
  87. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
  88. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/match_weight_histogram.json +0 -0
  89. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/match_weights_interactive_history.json +0 -0
  90. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/match_weights_waterfall.json +0 -0
  91. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/missingness.json +0 -0
  92. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/parameter_estimate_comparisons.json +0 -0
  93. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/phonetic_match_chart.json +0 -0
  94. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/precision_recall.json +0 -0
  95. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
  96. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/profile_data.json +0 -0
  97. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/roc.json +0 -0
  98. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/tf_adjustment_chart.json +0 -0
  99. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/threshold_selection_tool.json +0 -0
  100. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/unlinkables_chart_def.json +0 -0
  101. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/d3@7.8.5 +0 -0
  102. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/stdlib.js@5.8.3 +0 -0
  103. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/vega-embed@6.20.2 +0 -0
  104. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/vega-lite@5.2.0 +0 -0
  105. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/vega@5.21.0 +0 -0
  106. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/labelling_tool/slt.js +0 -0
  107. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/labelling_tool/template.j2 +0 -0
  108. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/settings_jsonschema.json +0 -0
  109. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
  110. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
  111. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
  112. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_cluster_studio/cluster_template.j2 +0 -0
  113. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_cluster_studio/custom.css +0 -0
  114. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_comparison_viewer/custom.css +0 -0
  115. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_comparison_viewer/template.j2 +0 -0
  116. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_vis_utils/splink_vis_utils.js +0 -0
  117. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/templates/single_chart_template.html +0 -0
  118. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/find_matches_to_new_records.py +0 -0
  119. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/graph_metrics.py +0 -0
  120. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/input_column.py +0 -0
  121. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/logging_messages.py +0 -0
  122. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/lower_id_on_lhs.py +0 -0
  123. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/m_u_records_to_parameters.py +0 -0
  124. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/match_key_analysis.py +0 -0
  125. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/misc.py +0 -0
  126. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/parse_sql.py +0 -0
  127. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/pipeline.py +0 -0
  128. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/postgres/__init__.py +0 -0
  129. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/postgres/database_api.py +0 -0
  130. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/postgres/dataframe.py +0 -0
  131. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/predict.py +0 -0
  132. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/profile_data.py +0 -0
  133. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings.py +0 -0
  134. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_creator.py +0 -0
  135. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/__init__.py +0 -0
  136. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/log_invalid_columns.py +0 -0
  137. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/settings_column_cleaner.py +0 -0
  138. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/settings_validation_log_strings.py +0 -0
  139. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/valid_types.py +0 -0
  140. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/__init__.py +0 -0
  141. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/database_api.py +0 -0
  142. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/dataframe.py +0 -0
  143. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/jar_location.py +0 -0
  144. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/spark_helpers/__init__.py +0 -0
  145. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/spark_helpers/custom_spark_dialect.py +0 -0
  146. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/splink_comparison_viewer.py +0 -0
  147. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/splink_dataframe.py +0 -0
  148. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sql_transform.py +0 -0
  149. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sqlite/__init__.py +0 -0
  150. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sqlite/database_api.py +0 -0
  151. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sqlite/dataframe.py +0 -0
  152. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/unique_id_concat.py +0 -0
  153. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/validate_jsonschema.py +0 -0
  154. {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/waterfall_chart.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: splink
3
- Version: 4.0.0.dev5
3
+ Version: 4.0.0.dev6
4
4
  Summary: Fast probabilistic data linkage at scale
5
5
  Home-page: https://github.com/moj-analytical-services/splink
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "splink"
3
- version = "4.0.0.dev5"
3
+ version = "4.0.0.dev6"
4
4
  description = "Fast probabilistic data linkage at scale"
5
5
  authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
6
6
  license = "MIT"
@@ -44,7 +44,7 @@ def __getattr__(name):
44
44
  raise AttributeError(f"module 'splink' has no attribute '{name}'") from None
45
45
 
46
46
 
47
- __version__ = "4.0.0.dev5"
47
+ __version__ = "4.0.0.dev6"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from copy import deepcopy
4
- from typing import TYPE_CHECKING
4
+ from typing import TYPE_CHECKING, Optional
5
5
 
6
6
  from splink.internals.block_from_labels import block_from_labels
7
7
  from splink.internals.blocking import BlockingRule
@@ -307,8 +307,11 @@ def _select_found_by_blocking_rules(linker: "Linker") -> str:
307
307
 
308
308
 
309
309
  def truth_space_table_from_labels_table(
310
- linker, labels_tablename, threshold_actual=0.5, match_weight_round_to_nearest=None
311
- ):
310
+ linker: Linker,
311
+ labels_tablename: str,
312
+ threshold_actual: float = 0.5,
313
+ match_weight_round_to_nearest: Optional[float] = None,
314
+ ) -> SplinkDataFrame:
312
315
  pipeline = CTEPipeline()
313
316
 
314
317
  nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
@@ -323,7 +326,7 @@ def truth_space_table_from_labels_table(
323
326
  )
324
327
  pipeline.enqueue_list_of_sqls(sqls)
325
328
 
326
- df_truth_space_table = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
329
+ df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
327
330
 
328
331
  return df_truth_space_table
329
332
 
@@ -356,7 +359,7 @@ def truth_space_table_from_labels_column(
356
359
  """
357
360
 
358
361
  pipeline.enqueue_sql(sql, "__splink__cartesian_product")
359
- cartesian_count = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
362
+ cartesian_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
360
363
  row_count_df = cartesian_count.as_record_dict()
361
364
  cartesian_count.drop_table_from_database_and_remove_from_cache()
362
365
 
@@ -393,7 +396,7 @@ def truth_space_table_from_labels_column(
393
396
  )
394
397
  pipeline.enqueue_list_of_sqls(sqls)
395
398
 
396
- df_truth_space_table = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
399
+ df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
397
400
 
398
401
  return df_truth_space_table
399
402
 
@@ -439,12 +442,12 @@ def predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename):
439
442
 
440
443
 
441
444
  def prediction_errors_from_labels_table(
442
- linker,
443
- labels_tablename,
444
- include_false_positives=True,
445
- include_false_negatives=True,
446
- threshold=0.5,
447
- ):
445
+ linker: Linker,
446
+ labels_tablename: str,
447
+ include_false_positives: bool = True,
448
+ include_false_negatives: bool = True,
449
+ threshold: float = 0.5,
450
+ ) -> SplinkDataFrame:
448
451
  pipeline = CTEPipeline()
449
452
  nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
450
453
  pipeline = CTEPipeline([nodes_with_tf])
@@ -486,7 +489,7 @@ def prediction_errors_from_labels_table(
486
489
 
487
490
  pipeline.enqueue_sql(sql, "__splink__labels_with_fp_fn_status")
488
491
 
489
- return linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
492
+ return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
490
493
 
491
494
 
492
495
  def _predict_from_label_column_sql(linker, label_colname):
@@ -509,18 +512,18 @@ def _predict_from_label_column_sql(linker, label_colname):
509
512
  settings._additional_column_names_to_retain.append(label_colname)
510
513
 
511
514
  # Now we want to create predictions
512
- df_predict = linker.predict()
515
+ df_predict = linker.inference.predict()
513
516
 
514
517
  return df_predict
515
518
 
516
519
 
517
520
  def prediction_errors_from_label_column(
518
- linker,
519
- label_colname,
520
- include_false_positives=True,
521
- include_false_negatives=True,
522
- threshold=0.5,
523
- ):
521
+ linker: Linker,
522
+ label_colname: str,
523
+ include_false_positives: bool = True,
524
+ include_false_negatives: bool = True,
525
+ threshold: float = 0.5,
526
+ ) -> SplinkDataFrame:
524
527
  df_predict = _predict_from_label_column_sql(
525
528
  linker,
526
529
  label_colname,
@@ -577,6 +580,6 @@ def prediction_errors_from_label_column(
577
580
 
578
581
  pipeline.enqueue_sql(sql, "__splink__predictions_from_label_column_fp_fn_only")
579
582
 
580
- predictions = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
583
+ predictions = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
581
584
 
582
585
  return predictions
@@ -63,7 +63,7 @@ def df_clusters_as_records(
63
63
  sql = _clusters_sql(df_clustered_nodes, cluster_ids)
64
64
  pipeline = CTEPipeline()
65
65
  pipeline.enqueue_sql(sql, "__splink__scs_clusters")
66
- df_clusters = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
66
+ df_clusters = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
67
67
 
68
68
  return df_clusters.as_record_dict()
69
69
 
@@ -107,7 +107,7 @@ def create_df_nodes(
107
107
  pipeline = CTEPipeline()
108
108
  sql = _nodes_sql(df_clustered_nodes, cluster_ids)
109
109
  pipeline.enqueue_sql(sql, "__splink__scs_nodes")
110
- df_nodes = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
110
+ df_nodes = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
111
111
 
112
112
  return df_nodes
113
113
 
@@ -151,7 +151,7 @@ def df_edges_as_records(
151
151
  sql = _edges_sql(linker, df_predicted_edges, df_nodes)
152
152
  pipeline = CTEPipeline()
153
153
  pipeline.enqueue_sql(sql, "__splink__scs_edges")
154
- df_edges = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
154
+ df_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
155
155
 
156
156
  return df_edges.as_record_dict()
157
157
 
@@ -168,7 +168,7 @@ def _get_random_cluster_ids(
168
168
  """
169
169
  pipeline = CTEPipeline()
170
170
  pipeline.enqueue_sql(sql, "__splink__cluster_count")
171
- df_cluster_count = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
171
+ df_cluster_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
172
172
  cluster_count = df_cluster_count.as_record_dict()[0]["count"]
173
173
  df_cluster_count.drop_table_from_database_and_remove_from_cache()
174
174
 
@@ -192,7 +192,7 @@ def _get_random_cluster_ids(
192
192
  """
193
193
  pipeline = CTEPipeline()
194
194
  pipeline.enqueue_sql(sql, "__splink__df_concat_with_tf_sample")
195
- df_sample = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
195
+ df_sample = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
196
196
 
197
197
  return [r["cluster_id"] for r in df_sample.as_record_dict()]
198
198
 
@@ -234,7 +234,7 @@ def _get_cluster_id_of_each_size(
234
234
  """
235
235
 
236
236
  pipeline.enqueue_sql(sql, "__splink__cluster_count_row_numbered")
237
- df_cluster_sample_with_size = linker.db_api.sql_pipeline_to_splink_dataframe(
237
+ df_cluster_sample_with_size = linker._db_api.sql_pipeline_to_splink_dataframe(
238
238
  pipeline
239
239
  )
240
240
 
@@ -285,7 +285,7 @@ def _get_lowest_density_clusters(
285
285
  """
286
286
 
287
287
  pipeline.enqueue_sql(sql, "__splink__lowest_density_clusters")
288
- df_lowest_density_clusters = linker.db_api.sql_pipeline_to_splink_dataframe(
288
+ df_lowest_density_clusters = linker._db_api.sql_pipeline_to_splink_dataframe(
289
289
  pipeline
290
290
  )
291
291
 
@@ -355,7 +355,7 @@ def _cc_create_unique_id_cols(
355
355
  """
356
356
  pipeline = CTEPipeline()
357
357
  pipeline.enqueue_sql(sql, "__splink__df_connected_components_df")
358
- return linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
358
+ return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
359
359
 
360
360
 
361
361
  def _exit_query(
@@ -453,7 +453,7 @@ def solve_connected_components(
453
453
  pipeline.enqueue_sql(sql, "nodes")
454
454
  sql = _cc_generate_neighbours_representation()
455
455
  pipeline.enqueue_sql(sql, "__splink__df_neighbours")
456
- neighbours = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
456
+ neighbours = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
457
457
 
458
458
  # Create our initial representatives table
459
459
  pipeline = CTEPipeline([neighbours])
@@ -465,7 +465,7 @@ def solve_connected_components(
465
465
  # Execute if we have no batching, otherwise add it to our batched process
466
466
  pipeline.enqueue_sql(sql, "__splink__df_representatives")
467
467
 
468
- representatives = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
468
+ representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
469
469
  prev_representatives_table = representatives
470
470
 
471
471
  # Loop while our representative table still has unsettled nodes
@@ -500,7 +500,7 @@ def solve_connected_components(
500
500
  repr_name,
501
501
  )
502
502
 
503
- representatives = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
503
+ representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
504
504
 
505
505
  pipeline = CTEPipeline()
506
506
  # Update table reference
@@ -512,7 +512,7 @@ def solve_connected_components(
512
512
 
513
513
  pipeline.enqueue_sql(sql, "__splink__df_root_rows")
514
514
 
515
- root_rows_df = linker.db_api.sql_pipeline_to_splink_dataframe(
515
+ root_rows_df = linker._db_api.sql_pipeline_to_splink_dataframe(
516
516
  pipeline, use_cache=False
517
517
  )
518
518
 
@@ -540,6 +540,6 @@ def solve_connected_components(
540
540
  )
541
541
  pipeline = CTEPipeline([representatives])
542
542
  pipeline.enqueue_sql(exit_query, "__splink__df_representatives")
543
- representatives = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
543
+ representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
544
544
 
545
545
  return representatives
@@ -68,7 +68,7 @@ def compute_basic_edge_metrics(
68
68
  )
69
69
  pipeline.enqueue_sql(**sql_info)
70
70
 
71
- df_truncated_edges = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
71
+ df_truncated_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
72
72
  return df_truncated_edges
73
73
 
74
74
 
@@ -96,13 +96,13 @@ def compute_igraph_metrics(
96
96
  # this is how igraph deals with nodes
97
97
  sql_infos = _node_mapping_table_sql(df_node_metrics)
98
98
  pipeline.enqueue_list_of_sqls(sql_infos)
99
- df_node_mappings = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
99
+ df_node_mappings = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
100
100
 
101
101
  # we keep only edges at or above relevant threshold
102
102
  pipeline = CTEPipeline()
103
103
  sql_info = _truncated_edges_sql(df_predict, threshold_match_probability)
104
104
  pipeline.enqueue_sql(**sql_info)
105
- df_truncated_edges = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
105
+ df_truncated_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
106
106
 
107
107
  # we map the truncated edges to the integer encoding for nodes above,
108
108
  # keeping only the list of endpoints
@@ -114,7 +114,7 @@ def compute_igraph_metrics(
114
114
  composite_uid_edges_r,
115
115
  )
116
116
  pipeline.enqueue_sql(**sql_info)
117
- edges_for_igraph = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
117
+ edges_for_igraph = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
118
118
  # we will need to manually register a table, so we use the hash from this table
119
119
  igraph_edges_hash = edges_for_igraph.physical_name[-9:]
120
120
  # NB: for large data we may have to revise this and process in chunks
@@ -124,7 +124,7 @@ def compute_igraph_metrics(
124
124
  igraph_df = ig.Graph.DataFrame(df_edges_for_igraph, directed=False)
125
125
  bridges_indices = igraph_df.bridges()
126
126
  df_bridges_pd = df_edges_for_igraph.iloc[bridges_indices, :]
127
- df_bridges = linker.register_table(
127
+ df_bridges = linker.table_management.register_table(
128
128
  df_bridges_pd, f"__splink__bridges_{igraph_edges_hash}"
129
129
  )
130
130
  # map our bridge edges back to the original node labelling
@@ -139,5 +139,5 @@ def compute_igraph_metrics(
139
139
  composite_uid_edges_r,
140
140
  )
141
141
  pipeline.enqueue_sql(**sql_info)
142
- df_edge_metrics = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
142
+ df_edge_metrics = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
143
143
  return df_edge_metrics
@@ -74,7 +74,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non
74
74
  settings_obj._retain_matching_columns = False
75
75
  settings_obj._retain_intermediate_calculation_columns = False
76
76
 
77
- db_api = training_linker.db_api
77
+ db_api = training_linker._db_api
78
78
 
79
79
  for cc in settings_obj.comparisons:
80
80
  for cl in cc.comparison_levels:
@@ -211,6 +211,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non
211
211
  ]
212
212
 
213
213
  m_u_records_lookup = m_u_records_to_lookup_dict(m_u_records)
214
+
214
215
  for c in original_settings_obj.comparisons:
215
216
  for cl in c._comparison_levels_excluding_null:
216
217
  append_u_probability_to_comparison_level_trained_probabilities(
@@ -158,13 +158,13 @@ def _search_tree_for_blocking_rules_below_threshold_count(
158
158
  if len(current_combination) == len(all_columns):
159
159
  return results # All fields included, meaning we're at a leaf so exit recursion
160
160
 
161
- br = _generate_blocking_rule(linker.db_api, current_combination)
161
+ br = _generate_blocking_rule(linker._db_api, current_combination)
162
162
 
163
163
  comparison_count = _count_comparisons_generated_from_blocking_rule(
164
164
  splink_df_dict=linker._input_tables_dict,
165
165
  blocking_rule=br,
166
166
  link_type=linker._settings_obj._link_type,
167
- db_api=linker.db_api,
167
+ db_api=linker._db_api,
168
168
  compute_post_filter_count=False,
169
169
  source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
170
170
  unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
@@ -50,9 +50,9 @@ def generate_labelling_tool_comparisons(
50
50
  """
51
51
 
52
52
  pipeline.enqueue_sql(sql, "__splink__df_labelling_tool_record")
53
- splink_df = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
53
+ splink_df = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
54
54
 
55
- matches = linker.find_matches_to_new_records(
55
+ matches = linker.inference.find_matches_to_new_records(
56
56
  splink_df.physical_name, match_weight_threshold=match_weight_threshold
57
57
  )
58
58