splink 4.0.0.dev5__tar.gz → 4.0.0.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/PKG-INFO +1 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/pyproject.toml +1 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/__init__.py +1 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/accuracy.py +24 -21
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cluster_studio.py +7 -7
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/connected_components.py +6 -6
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/edge_metrics.py +6 -6
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/estimate_u.py +2 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/find_brs_with_comparison_counts_below_threshold.py +2 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/labelling_tool.py +2 -2
- splink-4.0.0.dev6/splink/internals/linker.py +759 -0
- splink-4.0.0.dev6/splink/internals/linker_components/clustering.py +284 -0
- splink-4.0.0.dev6/splink/internals/linker_components/evaluation.py +389 -0
- splink-4.0.0.dev6/splink/internals/linker_components/inference.py +513 -0
- splink-4.0.0.dev6/splink/internals/linker_components/misc.py +85 -0
- splink-4.0.0.dev6/splink/internals/linker_components/table_management.py +206 -0
- splink-4.0.0.dev6/splink/internals/linker_components/training.py +444 -0
- splink-4.0.0.dev6/splink/internals/linker_components/visualisations.py +360 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/m_from_labels.py +5 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/m_training.py +5 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/match_weights_histogram.py +10 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/optimise_cost_of_brs.py +2 -3
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/term_frequencies.py +2 -2
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/unlinkables.py +1 -1
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/vertically_concatenate.py +2 -2
- splink-4.0.0.dev5/splink/internals/linker.py +0 -2835
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/LICENSE +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/README.md +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/backends/spark.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/backends/sqlite.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/blocking_analysis.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/comparison_level_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/comparison_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/comparison_template_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/datasets.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/exploratory.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/athena_helpers/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/athena_helpers/athena_transforms.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/athena_helpers/athena_utils.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/athena/linker.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/block_from_labels.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_analysis.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_rule_creator.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_rule_creator_utils.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/blocking_rule_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cache_dict_with_logging.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/charts.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/column_expression.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_creator.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_helpers.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_composition.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_creator.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_level_sql.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_template_library.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_vector_distribution.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/comparison_vector_values.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/completeness.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/constants.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cost_of_blocking_rules.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/databricks/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/databricks/enable_splink.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/datasets/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/default_from_jsonschema.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/dialects.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/duckdb_helpers/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/duckdb/duckdb_helpers/duckdb_helpers.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/em_training_session.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/exceptions.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/expectation_maximisation.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/DEPENDENCY_LICENSES.txt +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/accuracy_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/blocking_rule_generated_comparisons.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/comparator_score_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/comparator_score_threshold_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/completeness.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/match_weight_histogram.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/match_weights_interactive_history.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/match_weights_waterfall.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/missingness.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/parameter_estimate_comparisons.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/phonetic_match_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/precision_recall.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/profile_data.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/roc.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/tf_adjustment_chart.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/threshold_selection_tool.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/chart_defs/unlinkables_chart_def.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/d3@7.8.5 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/stdlib.js@5.8.3 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/vega-embed@6.20.2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/vega-lite@5.2.0 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/external_js/vega@5.21.0 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/labelling_tool/slt.js +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/labelling_tool/template.j2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/settings_jsonschema.json +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_cluster_studio/cluster_template.j2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_cluster_studio/custom.css +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_comparison_viewer/custom.css +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_comparison_viewer/template.j2 +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/splink_vis_utils/splink_vis_utils.js +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/files/templates/single_chart_template.html +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/find_matches_to_new_records.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/graph_metrics.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/input_column.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/logging_messages.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/lower_id_on_lhs.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/m_u_records_to_parameters.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/match_key_analysis.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/misc.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/parse_sql.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/pipeline.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/postgres/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/postgres/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/postgres/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/predict.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/profile_data.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_creator.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/log_invalid_columns.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/settings_column_cleaner.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/settings_validation_log_strings.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/settings_validation/valid_types.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/jar_location.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/spark_helpers/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/spark/spark_helpers/custom_spark_dialect.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/splink_comparison_viewer.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/splink_dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sql_transform.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sqlite/__init__.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sqlite/database_api.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/sqlite/dataframe.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/unique_id_concat.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/validate_jsonschema.py +0 -0
- {splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/waterfall_chart.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "splink"
|
|
3
|
-
version = "4.0.0.dev5"
|
|
3
|
+
version = "4.0.0.dev6"
|
|
4
4
|
description = "Fast probabilistic data linkage at scale"
|
|
5
5
|
authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
|
|
6
6
|
license = "MIT"
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from copy import deepcopy
|
|
4
|
-
from typing import TYPE_CHECKING
|
|
4
|
+
from typing import TYPE_CHECKING, Optional
|
|
5
5
|
|
|
6
6
|
from splink.internals.block_from_labels import block_from_labels
|
|
7
7
|
from splink.internals.blocking import BlockingRule
|
|
@@ -307,8 +307,11 @@ def _select_found_by_blocking_rules(linker: "Linker") -> str:
|
|
|
307
307
|
|
|
308
308
|
|
|
309
309
|
def truth_space_table_from_labels_table(
|
|
310
|
-
linker
|
|
311
|
-
|
|
310
|
+
linker: Linker,
|
|
311
|
+
labels_tablename: str,
|
|
312
|
+
threshold_actual: float = 0.5,
|
|
313
|
+
match_weight_round_to_nearest: Optional[float] = None,
|
|
314
|
+
) -> SplinkDataFrame:
|
|
312
315
|
pipeline = CTEPipeline()
|
|
313
316
|
|
|
314
317
|
nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
|
|
@@ -323,7 +326,7 @@ def truth_space_table_from_labels_table(
|
|
|
323
326
|
)
|
|
324
327
|
pipeline.enqueue_list_of_sqls(sqls)
|
|
325
328
|
|
|
326
|
-
df_truth_space_table = linker.
|
|
329
|
+
df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
327
330
|
|
|
328
331
|
return df_truth_space_table
|
|
329
332
|
|
|
@@ -356,7 +359,7 @@ def truth_space_table_from_labels_column(
|
|
|
356
359
|
"""
|
|
357
360
|
|
|
358
361
|
pipeline.enqueue_sql(sql, "__splink__cartesian_product")
|
|
359
|
-
cartesian_count = linker.
|
|
362
|
+
cartesian_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
360
363
|
row_count_df = cartesian_count.as_record_dict()
|
|
361
364
|
cartesian_count.drop_table_from_database_and_remove_from_cache()
|
|
362
365
|
|
|
@@ -393,7 +396,7 @@ def truth_space_table_from_labels_column(
|
|
|
393
396
|
)
|
|
394
397
|
pipeline.enqueue_list_of_sqls(sqls)
|
|
395
398
|
|
|
396
|
-
df_truth_space_table = linker.
|
|
399
|
+
df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
397
400
|
|
|
398
401
|
return df_truth_space_table
|
|
399
402
|
|
|
@@ -439,12 +442,12 @@ def predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename):
|
|
|
439
442
|
|
|
440
443
|
|
|
441
444
|
def prediction_errors_from_labels_table(
|
|
442
|
-
linker,
|
|
443
|
-
labels_tablename,
|
|
444
|
-
include_false_positives=True,
|
|
445
|
-
include_false_negatives=True,
|
|
446
|
-
threshold=0.5,
|
|
447
|
-
):
|
|
445
|
+
linker: Linker,
|
|
446
|
+
labels_tablename: str,
|
|
447
|
+
include_false_positives: bool = True,
|
|
448
|
+
include_false_negatives: bool = True,
|
|
449
|
+
threshold: float = 0.5,
|
|
450
|
+
) -> SplinkDataFrame:
|
|
448
451
|
pipeline = CTEPipeline()
|
|
449
452
|
nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
|
|
450
453
|
pipeline = CTEPipeline([nodes_with_tf])
|
|
@@ -486,7 +489,7 @@ def prediction_errors_from_labels_table(
|
|
|
486
489
|
|
|
487
490
|
pipeline.enqueue_sql(sql, "__splink__labels_with_fp_fn_status")
|
|
488
491
|
|
|
489
|
-
return linker.
|
|
492
|
+
return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
490
493
|
|
|
491
494
|
|
|
492
495
|
def _predict_from_label_column_sql(linker, label_colname):
|
|
@@ -509,18 +512,18 @@ def _predict_from_label_column_sql(linker, label_colname):
|
|
|
509
512
|
settings._additional_column_names_to_retain.append(label_colname)
|
|
510
513
|
|
|
511
514
|
# Now we want to create predictions
|
|
512
|
-
df_predict = linker.predict()
|
|
515
|
+
df_predict = linker.inference.predict()
|
|
513
516
|
|
|
514
517
|
return df_predict
|
|
515
518
|
|
|
516
519
|
|
|
517
520
|
def prediction_errors_from_label_column(
|
|
518
|
-
linker,
|
|
519
|
-
label_colname,
|
|
520
|
-
include_false_positives=True,
|
|
521
|
-
include_false_negatives=True,
|
|
522
|
-
threshold=0.5,
|
|
523
|
-
):
|
|
521
|
+
linker: Linker,
|
|
522
|
+
label_colname: str,
|
|
523
|
+
include_false_positives: bool = True,
|
|
524
|
+
include_false_negatives: bool = True,
|
|
525
|
+
threshold: float = 0.5,
|
|
526
|
+
) -> SplinkDataFrame:
|
|
524
527
|
df_predict = _predict_from_label_column_sql(
|
|
525
528
|
linker,
|
|
526
529
|
label_colname,
|
|
@@ -577,6 +580,6 @@ def prediction_errors_from_label_column(
|
|
|
577
580
|
|
|
578
581
|
pipeline.enqueue_sql(sql, "__splink__predictions_from_label_column_fp_fn_only")
|
|
579
582
|
|
|
580
|
-
predictions = linker.
|
|
583
|
+
predictions = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
581
584
|
|
|
582
585
|
return predictions
|
|
@@ -63,7 +63,7 @@ def df_clusters_as_records(
|
|
|
63
63
|
sql = _clusters_sql(df_clustered_nodes, cluster_ids)
|
|
64
64
|
pipeline = CTEPipeline()
|
|
65
65
|
pipeline.enqueue_sql(sql, "__splink__scs_clusters")
|
|
66
|
-
df_clusters = linker.
|
|
66
|
+
df_clusters = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
67
67
|
|
|
68
68
|
return df_clusters.as_record_dict()
|
|
69
69
|
|
|
@@ -107,7 +107,7 @@ def create_df_nodes(
|
|
|
107
107
|
pipeline = CTEPipeline()
|
|
108
108
|
sql = _nodes_sql(df_clustered_nodes, cluster_ids)
|
|
109
109
|
pipeline.enqueue_sql(sql, "__splink__scs_nodes")
|
|
110
|
-
df_nodes = linker.
|
|
110
|
+
df_nodes = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
111
111
|
|
|
112
112
|
return df_nodes
|
|
113
113
|
|
|
@@ -151,7 +151,7 @@ def df_edges_as_records(
|
|
|
151
151
|
sql = _edges_sql(linker, df_predicted_edges, df_nodes)
|
|
152
152
|
pipeline = CTEPipeline()
|
|
153
153
|
pipeline.enqueue_sql(sql, "__splink__scs_edges")
|
|
154
|
-
df_edges = linker.
|
|
154
|
+
df_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
155
155
|
|
|
156
156
|
return df_edges.as_record_dict()
|
|
157
157
|
|
|
@@ -168,7 +168,7 @@ def _get_random_cluster_ids(
|
|
|
168
168
|
"""
|
|
169
169
|
pipeline = CTEPipeline()
|
|
170
170
|
pipeline.enqueue_sql(sql, "__splink__cluster_count")
|
|
171
|
-
df_cluster_count = linker.
|
|
171
|
+
df_cluster_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
172
172
|
cluster_count = df_cluster_count.as_record_dict()[0]["count"]
|
|
173
173
|
df_cluster_count.drop_table_from_database_and_remove_from_cache()
|
|
174
174
|
|
|
@@ -192,7 +192,7 @@ def _get_random_cluster_ids(
|
|
|
192
192
|
"""
|
|
193
193
|
pipeline = CTEPipeline()
|
|
194
194
|
pipeline.enqueue_sql(sql, "__splink__df_concat_with_tf_sample")
|
|
195
|
-
df_sample = linker.
|
|
195
|
+
df_sample = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
196
196
|
|
|
197
197
|
return [r["cluster_id"] for r in df_sample.as_record_dict()]
|
|
198
198
|
|
|
@@ -234,7 +234,7 @@ def _get_cluster_id_of_each_size(
|
|
|
234
234
|
"""
|
|
235
235
|
|
|
236
236
|
pipeline.enqueue_sql(sql, "__splink__cluster_count_row_numbered")
|
|
237
|
-
df_cluster_sample_with_size = linker.
|
|
237
|
+
df_cluster_sample_with_size = linker._db_api.sql_pipeline_to_splink_dataframe(
|
|
238
238
|
pipeline
|
|
239
239
|
)
|
|
240
240
|
|
|
@@ -285,7 +285,7 @@ def _get_lowest_density_clusters(
|
|
|
285
285
|
"""
|
|
286
286
|
|
|
287
287
|
pipeline.enqueue_sql(sql, "__splink__lowest_density_clusters")
|
|
288
|
-
df_lowest_density_clusters = linker.
|
|
288
|
+
df_lowest_density_clusters = linker._db_api.sql_pipeline_to_splink_dataframe(
|
|
289
289
|
pipeline
|
|
290
290
|
)
|
|
291
291
|
|
|
@@ -355,7 +355,7 @@ def _cc_create_unique_id_cols(
|
|
|
355
355
|
"""
|
|
356
356
|
pipeline = CTEPipeline()
|
|
357
357
|
pipeline.enqueue_sql(sql, "__splink__df_connected_components_df")
|
|
358
|
-
return linker.
|
|
358
|
+
return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
359
359
|
|
|
360
360
|
|
|
361
361
|
def _exit_query(
|
|
@@ -453,7 +453,7 @@ def solve_connected_components(
|
|
|
453
453
|
pipeline.enqueue_sql(sql, "nodes")
|
|
454
454
|
sql = _cc_generate_neighbours_representation()
|
|
455
455
|
pipeline.enqueue_sql(sql, "__splink__df_neighbours")
|
|
456
|
-
neighbours = linker.
|
|
456
|
+
neighbours = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
457
457
|
|
|
458
458
|
# Create our initial representatives table
|
|
459
459
|
pipeline = CTEPipeline([neighbours])
|
|
@@ -465,7 +465,7 @@ def solve_connected_components(
|
|
|
465
465
|
# Execute if we have no batching, otherwise add it to our batched process
|
|
466
466
|
pipeline.enqueue_sql(sql, "__splink__df_representatives")
|
|
467
467
|
|
|
468
|
-
representatives = linker.
|
|
468
|
+
representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
469
469
|
prev_representatives_table = representatives
|
|
470
470
|
|
|
471
471
|
# Loop while our representative table still has unsettled nodes
|
|
@@ -500,7 +500,7 @@ def solve_connected_components(
|
|
|
500
500
|
repr_name,
|
|
501
501
|
)
|
|
502
502
|
|
|
503
|
-
representatives = linker.
|
|
503
|
+
representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
504
504
|
|
|
505
505
|
pipeline = CTEPipeline()
|
|
506
506
|
# Update table reference
|
|
@@ -512,7 +512,7 @@ def solve_connected_components(
|
|
|
512
512
|
|
|
513
513
|
pipeline.enqueue_sql(sql, "__splink__df_root_rows")
|
|
514
514
|
|
|
515
|
-
root_rows_df = linker.
|
|
515
|
+
root_rows_df = linker._db_api.sql_pipeline_to_splink_dataframe(
|
|
516
516
|
pipeline, use_cache=False
|
|
517
517
|
)
|
|
518
518
|
|
|
@@ -540,6 +540,6 @@ def solve_connected_components(
|
|
|
540
540
|
)
|
|
541
541
|
pipeline = CTEPipeline([representatives])
|
|
542
542
|
pipeline.enqueue_sql(exit_query, "__splink__df_representatives")
|
|
543
|
-
representatives = linker.
|
|
543
|
+
representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
544
544
|
|
|
545
545
|
return representatives
|
|
@@ -68,7 +68,7 @@ def compute_basic_edge_metrics(
|
|
|
68
68
|
)
|
|
69
69
|
pipeline.enqueue_sql(**sql_info)
|
|
70
70
|
|
|
71
|
-
df_truncated_edges = linker.
|
|
71
|
+
df_truncated_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
72
72
|
return df_truncated_edges
|
|
73
73
|
|
|
74
74
|
|
|
@@ -96,13 +96,13 @@ def compute_igraph_metrics(
|
|
|
96
96
|
# this is how igraph deals with nodes
|
|
97
97
|
sql_infos = _node_mapping_table_sql(df_node_metrics)
|
|
98
98
|
pipeline.enqueue_list_of_sqls(sql_infos)
|
|
99
|
-
df_node_mappings = linker.
|
|
99
|
+
df_node_mappings = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
100
100
|
|
|
101
101
|
# we keep only edges at or above relevant threshold
|
|
102
102
|
pipeline = CTEPipeline()
|
|
103
103
|
sql_info = _truncated_edges_sql(df_predict, threshold_match_probability)
|
|
104
104
|
pipeline.enqueue_sql(**sql_info)
|
|
105
|
-
df_truncated_edges = linker.
|
|
105
|
+
df_truncated_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
106
106
|
|
|
107
107
|
# we map the truncated edges to the integer encoding for nodes above,
|
|
108
108
|
# keeping only the list of endpoints
|
|
@@ -114,7 +114,7 @@ def compute_igraph_metrics(
|
|
|
114
114
|
composite_uid_edges_r,
|
|
115
115
|
)
|
|
116
116
|
pipeline.enqueue_sql(**sql_info)
|
|
117
|
-
edges_for_igraph = linker.
|
|
117
|
+
edges_for_igraph = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
118
118
|
# we will need to manually register a table, so we use the hash from this table
|
|
119
119
|
igraph_edges_hash = edges_for_igraph.physical_name[-9:]
|
|
120
120
|
# NB: for large data we may have to revise this and process in chunks
|
|
@@ -124,7 +124,7 @@ def compute_igraph_metrics(
|
|
|
124
124
|
igraph_df = ig.Graph.DataFrame(df_edges_for_igraph, directed=False)
|
|
125
125
|
bridges_indices = igraph_df.bridges()
|
|
126
126
|
df_bridges_pd = df_edges_for_igraph.iloc[bridges_indices, :]
|
|
127
|
-
df_bridges = linker.register_table(
|
|
127
|
+
df_bridges = linker.table_management.register_table(
|
|
128
128
|
df_bridges_pd, f"__splink__bridges_{igraph_edges_hash}"
|
|
129
129
|
)
|
|
130
130
|
# map our bridge edges back to the original node labelling
|
|
@@ -139,5 +139,5 @@ def compute_igraph_metrics(
|
|
|
139
139
|
composite_uid_edges_r,
|
|
140
140
|
)
|
|
141
141
|
pipeline.enqueue_sql(**sql_info)
|
|
142
|
-
df_edge_metrics = linker.
|
|
142
|
+
df_edge_metrics = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
143
143
|
return df_edge_metrics
|
|
@@ -74,7 +74,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non
|
|
|
74
74
|
settings_obj._retain_matching_columns = False
|
|
75
75
|
settings_obj._retain_intermediate_calculation_columns = False
|
|
76
76
|
|
|
77
|
-
db_api = training_linker.
|
|
77
|
+
db_api = training_linker._db_api
|
|
78
78
|
|
|
79
79
|
for cc in settings_obj.comparisons:
|
|
80
80
|
for cl in cc.comparison_levels:
|
|
@@ -211,6 +211,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non
|
|
|
211
211
|
]
|
|
212
212
|
|
|
213
213
|
m_u_records_lookup = m_u_records_to_lookup_dict(m_u_records)
|
|
214
|
+
|
|
214
215
|
for c in original_settings_obj.comparisons:
|
|
215
216
|
for cl in c._comparison_levels_excluding_null:
|
|
216
217
|
append_u_probability_to_comparison_level_trained_probabilities(
|
|
@@ -158,13 +158,13 @@ def _search_tree_for_blocking_rules_below_threshold_count(
|
|
|
158
158
|
if len(current_combination) == len(all_columns):
|
|
159
159
|
return results # All fields included, meaning we're at a leaf so exit recursion
|
|
160
160
|
|
|
161
|
-
br = _generate_blocking_rule(linker.
|
|
161
|
+
br = _generate_blocking_rule(linker._db_api, current_combination)
|
|
162
162
|
|
|
163
163
|
comparison_count = _count_comparisons_generated_from_blocking_rule(
|
|
164
164
|
splink_df_dict=linker._input_tables_dict,
|
|
165
165
|
blocking_rule=br,
|
|
166
166
|
link_type=linker._settings_obj._link_type,
|
|
167
|
-
db_api=linker.
|
|
167
|
+
db_api=linker._db_api,
|
|
168
168
|
compute_post_filter_count=False,
|
|
169
169
|
source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
|
|
170
170
|
unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,
|
|
@@ -50,9 +50,9 @@ def generate_labelling_tool_comparisons(
|
|
|
50
50
|
"""
|
|
51
51
|
|
|
52
52
|
pipeline.enqueue_sql(sql, "__splink__df_labelling_tool_record")
|
|
53
|
-
splink_df = linker.
|
|
53
|
+
splink_df = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
|
|
54
54
|
|
|
55
|
-
matches = linker.find_matches_to_new_records(
|
|
55
|
+
matches = linker.inference.find_matches_to_new_records(
|
|
56
56
|
splink_df.physical_name, match_weight_threshold=match_weight_threshold
|
|
57
57
|
)
|
|
58
58
|
|