splink-4.0.0.dev8.tar.gz → splink-4.0.0.dev9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/PKG-INFO +1 -1
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/pyproject.toml +1 -1
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/__init__.py +4 -11
- splink-4.0.0.dev9/splink/backends/duckdb.py +3 -0
- splink-4.0.0.dev9/splink/backends/postgres.py +3 -0
- splink-4.0.0.dev9/splink/backends/spark.py +4 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/accuracy.py +5 -5
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_creator.py +39 -13
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_composition.py +0 -1
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_creator.py +56 -22
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_library.py +5 -2
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_library.py +0 -37
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/database_api.py +3 -10
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/dialects.py +0 -1
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/em_training_session.py +17 -30
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker.py +9 -9
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/evaluation.py +57 -32
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/inference.py +4 -4
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/misc.py +3 -1
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/table_management.py +1 -2
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/training.py +3 -66
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/database_api.py +12 -28
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/splink_dataframe.py +2 -2
- splink-4.0.0.dev8/splink/backends/spark.py +0 -3
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/LICENSE +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/README.md +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/backends/athena.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/backends/sqlite.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/blocking_analysis.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/comparison_level_library.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/comparison_library.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/datasets.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/exploratory.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/athena_helpers/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/athena_helpers/athena_transforms.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/athena_helpers/athena_utils.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/database_api.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/athena/dataframe.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/block_from_labels.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_analysis.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_rule_creator.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_rule_creator_utils.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/blocking_rule_library.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/cache_dict_with_logging.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/charts.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/cluster_studio.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/column_expression.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_level_sql.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_vector_distribution.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/comparison_vector_values.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/completeness.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/connected_components.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/constants.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/cost_of_blocking_rules.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/databricks/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/databricks/enable_splink.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/datasets/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/default_from_jsonschema.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/database_api.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/dataframe.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/duckdb_helpers/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/duckdb/duckdb_helpers/duckdb_helpers.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/edge_metrics.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/estimate_u.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/exceptions.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/expectation_maximisation.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/DEPENDENCY_LICENSES.txt +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/accuracy_chart.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/blocking_rule_generated_comparisons.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/comparator_score_chart.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/comparator_score_threshold_chart.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/completeness.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/match_weight_histogram.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/match_weights_interactive_history.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/match_weights_waterfall.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/missingness.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/parameter_estimate_comparisons.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/phonetic_match_chart.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/precision_recall.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/profile_data.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/roc.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/tf_adjustment_chart.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/threshold_selection_tool.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/chart_defs/unlinkables_chart_def.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/d3@7.8.5 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/stdlib.js@5.8.3 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/vega-embed@6.20.2 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/vega-lite@5.2.0 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/external_js/vega@5.21.0 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/labelling_tool/slt.js +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/labelling_tool/template.j2 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/settings_jsonschema.json +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_cluster_studio/cluster_template.j2 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_cluster_studio/custom.css +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_comparison_viewer/custom.css +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_comparison_viewer/template.j2 +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/splink_vis_utils/splink_vis_utils.js +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/files/templates/single_chart_template.html +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/find_brs_with_comparison_counts_below_threshold.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/find_matches_to_new_records.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/graph_metrics.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/input_column.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/labelling_tool.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/clustering.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/linker_components/visualisations.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/logging_messages.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/lower_id_on_lhs.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/m_from_labels.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/m_training.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/m_u_records_to_parameters.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/match_key_analysis.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/match_weights_histogram.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/misc.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/optimise_cost_of_brs.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/parse_sql.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/pipeline.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/postgres/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/postgres/database_api.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/postgres/dataframe.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/predict.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/profile_data.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_creator.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/log_invalid_columns.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/settings_column_cleaner.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/settings_validation_log_strings.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/settings_validation/valid_types.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/similarity_analysis.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/dataframe.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/jar_location.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/spark_helpers/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/spark/spark_helpers/custom_spark_dialect.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/splink_comparison_viewer.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sql_transform.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sqlite/__init__.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sqlite/database_api.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/sqlite/dataframe.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/term_frequencies.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/unique_id_concat.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/unlinkables.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/validate_jsonschema.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/vertically_concatenate.py +0 -0
- {splink-4.0.0.dev8 → splink-4.0.0.dev9}/splink/internals/waterfall_chart.py +0 -0
```diff
--- splink-4.0.0.dev8/pyproject.toml
+++ splink-4.0.0.dev9/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "4.0.0.dev8"
+version = "4.0.0.dev9"
 description = "Fast probabilistic data linkage at scale"
 authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
 license = "MIT"
```
```diff
--- splink-4.0.0.dev8/splink/__init__.py
+++ splink-4.0.0.dev9/splink/__init__.py
@@ -6,8 +6,8 @@ from splink.internals.datasets import splink_datasets
 from splink.internals.linker import Linker
 from splink.internals.settings_creator import SettingsCreator
 
-# The following is a workaround for the fact that dependencies of
-#
+# The following is a workaround for the fact that dependencies of particular backends
+# may not be installed, but we don't want this to prevent import
 # of the other backends.
 
 # This enables auto-complete to be used to import the various DBAPIs
@@ -15,7 +15,6 @@ from splink.internals.settings_creator import SettingsCreator
 # without importing them at runtime
 if TYPE_CHECKING:
     from splink.internals.duckdb.database_api import DuckDBAPI
-    from splink.internals.postgres.database_api import PostgresAPI
     from splink.internals.spark.database_api import SparkAPI
 
 
@@ -30,12 +29,8 @@ def __getattr__(name):
             from splink.internals.duckdb.database_api import DuckDBAPI
 
             return DuckDBAPI
-        elif name == "PostgresAPI":
-            from splink.internals.postgres.database_api import PostgresAPI
-
-            return PostgresAPI
     except ImportError as err:
-        if name in ["SparkAPI", "DuckDBAPI"
+        if name in ["SparkAPI", "DuckDBAPI"]:
             raise ImportError(
                 f"{name} cannot be imported because its dependencies are not "
                 "installed. Please `pip install` the required package(s) as "
@@ -44,7 +39,7 @@ def __getattr__(name):
     raise AttributeError(f"module 'splink' has no attribute '{name}'") from None
 
 
-__version__ = "4.0.0.dev8"
+__version__ = "4.0.0.dev9"
 
 
 __all__ = [
@@ -52,9 +47,7 @@ __all__ = [
     "ColumnExpression",
     "DuckDBAPI",
     "Linker",
-    "PostgresAPI",
     "SettingsCreator",
     "SparkAPI",
     "splink_datasets",
-    "SQLiteAPI",
 ]
```
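The `PostgresAPI` and `SQLiteAPI` entries removed from `splink/__init__.py` above correspond to the new thin modules under `splink/backends/` listed at the top of this diff. A hedged sketch of what imports could look like under the dev9 layout; the `splink.backends.postgres` and `splink.backends.sqlite` import paths are assumptions based on those new files, not something shown in the hunks:

```python
# Illustrative sketch only - the Postgres/SQLite import paths are assumed
# from the new splink/backends/*.py modules listed in this diff.
from splink import DuckDBAPI, SparkAPI              # still lazily exported at top level
from splink.backends.postgres import PostgresAPI    # assumed new location
from splink.backends.sqlite import SQLiteAPI        # assumed new location

db_api = DuckDBAPI()  # in-process DuckDB backend; no server required
```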
```diff
--- splink-4.0.0.dev8/splink/internals/accuracy.py
+++ splink-4.0.0.dev9/splink/internals/accuracy.py
@@ -446,7 +446,7 @@ def prediction_errors_from_labels_table(
     labels_tablename: str,
     include_false_positives: bool = True,
     include_false_negatives: bool = True,
-
+    threshold_match_probability: float = 0.5,
 ) -> SplinkDataFrame:
     pipeline = CTEPipeline()
     nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
@@ -457,13 +457,13 @@ def prediction_errors_from_labels_table(
     pipeline.enqueue_list_of_sqls(sqls)
 
     false_positives = f"""
-    (clerical_match_score < {
-    match_probability > {
+    (clerical_match_score < {threshold_match_probability} and
+    match_probability > {threshold_match_probability})
     """
 
     false_negatives = f"""
-    (clerical_match_score > {
-    match_probability < {
+    (clerical_match_score > {threshold_match_probability} and
+    match_probability < {threshold_match_probability})
     """
 
     where_conditions = []
```
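The new `threshold_match_probability` parameter makes the cut-off used to classify prediction errors explicit. The SQL fragments above encode the following logic, shown here as a plain-Python illustration (not library code):

```python
def is_false_positive(clerical_match_score: float,
                      match_probability: float,
                      threshold_match_probability: float = 0.5) -> bool:
    # Labelled (clerically) as a non-match, but predicted as a match
    return (clerical_match_score < threshold_match_probability
            and match_probability > threshold_match_probability)


def is_false_negative(clerical_match_score: float,
                      match_probability: float,
                      threshold_match_probability: float = 0.5) -> bool:
    # Labelled (clerically) as a match, but predicted as a non-match
    return (clerical_match_score > threshold_match_probability
            and match_probability < threshold_match_probability)
```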
````diff
--- splink-4.0.0.dev8/splink/internals/comparison_creator.py
+++ splink-4.0.0.dev9/splink/internals/comparison_creator.py
@@ -7,7 +7,11 @@ from splink.internals.column_expression import ColumnExpression
 from splink.internals.exceptions import SplinkException
 
 from .comparison import Comparison
-from .comparison_level_creator import
+from .comparison_level_creator import (
+    ComparisonLevelCreator,
+    UnsuppliedNoneOr,
+    unsupplied_option,
+)
 
 
 class ComparisonCreator(ABC):
@@ -65,7 +69,6 @@ class ComparisonCreator(ABC):
         # create levels - let them raise errors if there are issues
         self.create_comparison_levels()
 
-    # TODO: property?
     @abstractmethod
     def create_comparison_levels(self) -> List[ComparisonLevelCreator]:
         pass
@@ -77,9 +80,11 @@ class ComparisonCreator(ABC):
 
         if self.term_frequency_adjustments:
             for cl in comparison_levels:
-
-
-
+                if (
+                    hasattr(cl, "col_expression")
+                    and cl.col_expression.is_pure_column_or_column_reference
+                    and cl.is_exact_match_level
+                ):
                     cl.term_frequency_adjustments = True
 
         if self.m_probabilities:
@@ -145,23 +150,37 @@ class ComparisonCreator(ABC):
     def configure(
         self,
         *,
-        term_frequency_adjustments: bool =
-        m_probabilities: List[float] =
-        u_probabilities: List[float] =
+        term_frequency_adjustments: UnsuppliedNoneOr[bool] = unsupplied_option,
+        m_probabilities: UnsuppliedNoneOr[List[float]] = unsupplied_option,
+        u_probabilities: UnsuppliedNoneOr[List[float]] = unsupplied_option,
     ) -> "ComparisonCreator":
         """
-        Configure the comparison creator with
+        Configure the comparison creator with options that are common to all
+        comparisons.
+
+        For m and u probabilities, the first
         element in the list corresponds to the first comparison level, usually
         an exact match level. Subsequent elements correspond comparison to
         levels in sequential order, through to the last element which is usually
         the 'ELSE' level.
 
+        All options have default options set initially. Any call to `.configure()`
+        will set any options that are supplied. Any subsequent calls to `.configure()`
+        will not override these values with defaults; to override values you must
+        explicitly provide a value corresponding to the default.
+
+        Generally speaking only a single call (at most) to `.configure()` should
+        be required.
+
         Args:
             term_frequency_adjustments (bool, optional): Whether term frequency
                 adjustments are switched on for this comparison. Only applied
-                to exact match levels.
+                to exact match levels.
+                Default corresponds to False.
             m_probabilities (list, optional): List of m probabilities
+                Default corresponds to None.
             u_probabilities (list, optional): List of u probabilities
+                Default corresponds to None.
 
         Example:
             ```py
@@ -175,9 +194,16 @@ class ComparisonCreator(ABC):
             ```
 
         """
-
-
-
+        configurables = {
+            "term_frequency_adjustments": term_frequency_adjustments,
+            "m_probabilities": m_probabilities,
+            "u_probabilities": u_probabilities,
+        }
+
+        for attribute_name, attribute_value in configurables.items():
+            if attribute_value is not unsupplied_option:
+                setattr(self, attribute_name, attribute_value)
+
         return self
 
     @property
````
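With sentinel defaults, `ComparisonCreator.configure()` now only touches the options that are explicitly supplied, so a second call no longer silently resets earlier ones. A hedged usage sketch; `ExactMatch` from `splink.comparison_library` is assumed here and is not part of this diff:

```python
import splink.comparison_library as cl

comparison = cl.ExactMatch("first_name")

# First call: switch on term frequency adjustments only.
comparison.configure(term_frequency_adjustments=True)

# Second call: supply m probabilities. term_frequency_adjustments remains True
# because unsupplied options are left untouched rather than reset to defaults.
comparison.configure(m_probabilities=[0.9, 0.1])

# To revert an option you must pass its default value explicitly.
comparison.configure(m_probabilities=None)
```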
```diff
--- splink-4.0.0.dev8/splink/internals/comparison_level_creator.py
+++ splink-4.0.0.dev9/splink/internals/comparison_level_creator.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from inspect import signature
-from typing import Any, final
+from typing import Any, TypeVar, Union, final
 
 from splink.internals.column_expression import ColumnExpression
 from splink.internals.dialects import SplinkDialect
@@ -10,6 +10,22 @@ from splink.internals.dialects import SplinkDialect
 from .comparison_level import ComparisonLevel
 
 
+class _UnsuppliedOption:
+    _instance: "_UnsuppliedOption" | None = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(_UnsuppliedOption, cls).__new__(cls)
+        return cls._instance
+
+
+unsupplied_option = _UnsuppliedOption()
+
+T = TypeVar("T")
+# type alias - either the specified type, _UnsuppliedOption, or None
+UnsuppliedNoneOr = Union[T, _UnsuppliedOption, None]
+
+
 class ComparisonLevelCreator(ABC):
     # off by default - only a small subset should have tf adjustments
     term_frequency_adjustments = False
@@ -56,14 +72,14 @@ class ComparisonLevelCreator(ABC):
     def configure(
         self,
         *,
-        m_probability: float =
-        u_probability: float =
-        tf_adjustment_column: str =
-        tf_adjustment_weight: float =
-        tf_minimum_u_value: float =
-        is_null_level: bool =
-        label_for_charts: str =
-        disable_tf_exact_match_detection: bool =
+        m_probability: UnsuppliedNoneOr[float] = unsupplied_option,
+        u_probability: UnsuppliedNoneOr[float] = unsupplied_option,
+        tf_adjustment_column: UnsuppliedNoneOr[str] = unsupplied_option,
+        tf_adjustment_weight: UnsuppliedNoneOr[float] = unsupplied_option,
+        tf_minimum_u_value: UnsuppliedNoneOr[float] = unsupplied_option,
+        is_null_level: UnsuppliedNoneOr[bool] = unsupplied_option,
+        label_for_charts: UnsuppliedNoneOr[str] = unsupplied_option,
+        disable_tf_exact_match_detection: UnsuppliedNoneOr[bool] = unsupplied_option,
     ) -> "ComparisonLevelCreator":
         """
         Configure the comparison level with options which are common to all
@@ -71,29 +87,47 @@ class ComparisonLevelCreator(ABC):
         specification of a comparison level. These options are usually not
         needed, but are available for advanced users.
 
+        All options have default options set initially. Any call to `.configure()`
+        will set any options that are supplied. Any subsequent calls to `.configure()`
+        will not override these values with defaults; to override values you must must
+        explicitly provide a value corresponding to the default.
+
+        Generally speaking only a single call (at most) to `.configure()` should
+        be required.
 
         Args:
             m_probability (float, optional): The m probability for this
-                comparison level.
+                comparison level.
+                Default is equivalent to None, in which case a default initial value
+                will be provided for this level.
             u_probability (float, optional): The u probability for this
-                comparison level.
+                comparison level.
+                Default is equivalent to None, in which case a default initial value
+                will be provided for this level.
             tf_adjustment_column (str, optional): Make term frequency adjustments for
-                this comparison level using this input column.
-
+                this comparison level using this input column.
+                Default is equivalent to None, meaning that term-frequency adjustments
+                will not be applied for this level.
             tf_adjustment_weight (float, optional): Make term frequency adjustments
-                for this comparison level using this weight.
-                meaning term-frequency adjustments are
+                for this comparison level using this weight.
+                Default is equivalent to None, meaning term-frequency adjustments are
+                fully-weighted if turned on.
             tf_minimum_u_value (float, optional): When term frequency adjustments are
                 turned on, where the term frequency adjustment implies a u value below
-                this value, use this minimum value instead.
-                no minimum value.
+                this value, use this minimum value instead.
+                Defaults is equivalent to None, meaning no minimum value.
             is_null_level (bool, optional): If true, m and u values will not be
                 estimated and instead the match weight will be zero for this column.
-
+                Default is equivalent to False.
             label_for_charts (str, optional): If provided, a custom label that will
-                be used for this level in any charts.
-                a default label will be
-
+                be used for this level in any charts.
+                Default is equivalent to None, in which case a default label will be
+                provided for this level.
+            disable_tf_exact_match_detection (bool, optional): If true, if term
+                frequency adjustments are set, the corresponding adjustment will be
+                made using the u-value for _this_ level, rather than the usual case
+                where it is the u-value of the exact match level in the same comparison.
+                Default is equivalent to False.
         Returns:
             ComparisonLevelCreator: The instance of the ComparisonLevelCreator class
                 with the updated configuration.
@@ -101,7 +135,7 @@ class ComparisonLevelCreator(ABC):
         args = locals()
         del args["self"]
         for k, v in args.items():
-            if v is not
+            if v is not unsupplied_option:
                 setattr(self, k, v)
 
         return self
```
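The `_UnsuppliedOption` singleton exists because `None` is itself a meaningful value for several of these options (for example, `ExactMatchLevel` below now passes `tf_adjustment_column=None` explicitly), so a separate sentinel is needed to mean "the caller passed nothing". A stand-alone sketch of the same pattern, not Splink's own code:

```python
from typing import Optional, TypeVar, Union

T = TypeVar("T")


class _Unsupplied:
    """Singleton sentinel meaning 'argument was not passed at all'."""

    _instance: Optional["_Unsupplied"] = None

    def __new__(cls) -> "_Unsupplied":
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance


UNSUPPLIED = _Unsupplied()
MaybeUnsupplied = Union[T, _Unsupplied, None]


class Level:
    m_probability: Optional[float] = None

    def configure(self, *, m_probability: MaybeUnsupplied[float] = UNSUPPLIED) -> "Level":
        # Only overwrite when the caller actually supplied something;
        # an explicit None is still a real value and is applied.
        if m_probability is not UNSUPPLIED:
            self.m_probability = m_probability
        return self


level = Level().configure(m_probability=None)  # explicitly sets the value to None
level.configure()                              # leaves the existing value untouched
```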
```diff
--- splink-4.0.0.dev8/splink/internals/comparison_level_library.py
+++ splink-4.0.0.dev9/splink/internals/comparison_level_library.py
@@ -256,8 +256,11 @@ class ExactMatchLevel(ComparisonLevelCreator):
                 tf_adjustment_column=self.col_expression.raw_sql_expression,
                 tf_adjustment_weight=1.0,
             )
-
-
+        else:
+            self.configure(
+                tf_adjustment_column=None,
+                tf_adjustment_weight=None,
+            )
 
     def create_sql(self, sql_dialect: SplinkDialect) -> str:
         self.col_expression.sql_dialect = sql_dialect
```
```diff
--- splink-4.0.0.dev8/splink/internals/comparison_library.py
+++ splink-4.0.0.dev9/splink/internals/comparison_library.py
@@ -615,7 +615,6 @@ class DateOfBirthComparison(ComparisonCreator):
             "year",
         ],
         datetime_format: str = None,
-        separate_1st_january: bool = False,
         invalid_dates_as_null: bool = True,
     ):
         """
@@ -644,10 +643,6 @@ class DateOfBirthComparison(ComparisonCreator):
                 Metrics for date differences. Defaults to ["month", "year", "year"].
             datetime_format (str, optional): The datetime format used to cast strings
                 to dates. Only used if input is a string.
-            separate_1st_january (bool, optional): Used for when date of birth is
-                sometimes recorded as 1st of Jan when only the year is known / If True,
-                a level is included for for a match on the year where at least one
-                side of the match is a date on the the 1st of January.
             invalid_dates_as_null (bool, optional): If True, treat invalid dates as null
                 as opposed to allowing e.g. an exact or levenshtein match where one side
                 or both are an invalid date. Only used if input is a string. Defaults
@@ -672,8 +667,6 @@ class DateOfBirthComparison(ComparisonCreator):
 
         self.datetime_format = datetime_format
 
-        self.separate_1st_january = separate_1st_january
-
         self.input_is_string = input_is_string
         self.invalid_dates_as_null = invalid_dates_as_null
 
@@ -693,36 +686,6 @@ class DateOfBirthComparison(ComparisonCreator):
             cll.NullLevel(null_col),
         ]
 
-        if self.input_is_string:
-            date_as_iso_string = self.datetime_parse_function(
-                self.datetime_format
-            ).cast_to_string()
-        else:
-            date_as_iso_string = self.col_expression.cast_to_string()
-
-        if self.separate_1st_january:
-            level = cll.And(
-                cll.Or(
-                    cll.LiteralMatchLevel(
-                        date_as_iso_string.substr(6, 5),
-                        literal_value="01-01",
-                        literal_datatype="string",
-                        side_of_comparison="left",
-                    ),
-                    cll.LiteralMatchLevel(
-                        date_as_iso_string.substr(6, 5),
-                        literal_value="01-01",
-                        literal_datatype="string",
-                        side_of_comparison="right",
-                    ),
-                ),
-                cll.ExactMatchLevel(date_as_iso_string.substr(0, 4)),
-            )
-
-            level.configure(label_for_charts="Exact match on year, 1st Jan only")
-
-            levels.append(level)
-
         levels.append(
             cll.ExactMatchLevel(self.col_expression).configure(
                 label_for_charts="Exact match on date of birth"
```
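With `separate_1st_january` removed, passing that keyword to `DateOfBirthComparison` would now raise a `TypeError`. A hedged sketch of constructing the comparison under the dev9 signature; argument values are illustrative, and only the parameter names visible in this diff are taken from the source:

```python
import splink.comparison_library as cl

# dev8 code may have passed separate_1st_january=True here; that keyword no
# longer exists in dev9.
dob_comparison = cl.DateOfBirthComparison(
    "date_of_birth",
    input_is_string=True,          # stored as self.input_is_string in the diff
    datetime_format="%Y-%m-%d",    # only used when the input is a string
    invalid_dates_as_null=True,    # invalid date strings are treated as null
)
```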
```diff
--- splink-4.0.0.dev8/splink/internals/database_api.py
+++ splink-4.0.0.dev9/splink/internals/database_api.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import hashlib
 import logging
-import random
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
@@ -47,8 +46,7 @@ class DatabaseAPI(ABC, Generic[TablishType]):
 
     def __init__(self) -> None:
         self._intermediate_table_cache: CacheDictWithLogging = CacheDictWithLogging()
-
-        self._cache_uid: str = str(random.choice(range(10000)))
+        self._cache_uid: str = ascii_uid(8)
 
     @final
     def _log_and_run_sql_execution(
@@ -80,7 +78,6 @@ class DatabaseAPI(ABC, Generic[TablishType]):
                 f"\n\nError was: {e}"
             ) from e
 
-    # TODO: rename this?
     @final
     def _sql_to_splink_dataframe(
         self, sql: str, templated_name: str, physical_name: str
@@ -140,9 +137,8 @@ class DatabaseAPI(ABC, Generic[TablishType]):
         use_cache: bool = True,
     ) -> SplinkDataFrame:
         # differences from _sql_to_splink_dataframe:
-        # this _calculates_ physical name,
-        #
-        # TODO: also maybe caching? but maybe that is even lower down
+        # this _calculates_ physical name, handles debug_mode,
+        # and checks cache before querying
         to_hash = (sql + self._cache_uid).encode("utf-8")
         hash = hashlib.sha256(to_hash).hexdigest()[:9]
         # Ensure hash is valid sql table name
@@ -342,9 +338,6 @@ class DatabaseAPI(ABC, Generic[TablishType]):
         input_tables = ensure_is_list(input_tables)
         return input_tables
 
-    # should probably also be responsible for cache
-    # TODO: stick this in a cache-api that lives on this
-
     def remove_splinkdataframe_from_cache(
         self, splink_dataframe: SplinkDataFrame
     ) -> None:
```
```diff
--- splink-4.0.0.dev8/splink/internals/dialects.py
+++ splink-4.0.0.dev9/splink/internals/dialects.py
@@ -214,7 +214,6 @@ class DuckDBDialect(SplinkDialect):
     ) -> str:
         return f"regexp_extract({name}, '{pattern}', {capture_group})"
 
-    # TODO: roll out to other dialects, at least for now
     @property
     def infinity_expression(self):
        return "cast('infinity' as float8)"
```
```diff
--- splink-4.0.0.dev8/splink/internals/em_training_session.py
+++ splink-4.0.0.dev9/splink/internals/em_training_session.py
@@ -11,7 +11,6 @@ from splink.internals.charts import (
     probability_two_random_records_match_iteration_chart,
 )
 from splink.internals.comparison import Comparison
-from splink.internals.comparison_level import ComparisonLevel
 from splink.internals.comparison_vector_values import (
     compute_comparison_vector_values_from_id_pairs_sqls,
 )
@@ -57,8 +56,6 @@ class EMTrainingSession:
         fix_u_probabilities: bool = False,
         fix_m_probabilities: bool = False,
         fix_probability_two_random_records_match: bool = False,
-        comparisons_to_deactivate: list[Comparison] = None,
-        comparison_levels_to_reverse_blocking_rule: list[ComparisonLevel] = None,
         estimate_without_term_frequencies: bool = False,
     ):
         logger.info("\n----- Starting EM training session -----\n")
@@ -77,20 +74,13 @@ class EMTrainingSession:
         self._blocking_rule_for_training = blocking_rule_for_training
         self.estimate_without_term_frequencies = estimate_without_term_frequencies
 
-
-
-
-
-
-
-
-        self._comparison_levels_to_reverse_blocking_rule: list[
-            ComparisonAndLevelDict
-        ] = Settings._get_comparison_levels_corresponding_to_training_blocking_rule(  # noqa
-            blocking_rule_sql=blocking_rule_for_training.blocking_rule_sql,
-            sqlglot_dialect_name=self.db_api.sql_dialect.sqlglot_name,
-            comparisons=core_model_settings.comparisons,
-        )
+        self._comparison_levels_to_reverse_blocking_rule: list[
+            ComparisonAndLevelDict
+        ] = Settings._get_comparison_levels_corresponding_to_training_blocking_rule(  # noqa
+            blocking_rule_sql=blocking_rule_for_training.blocking_rule_sql,
+            sqlglot_dialect_name=self.db_api.sql_dialect.sqlglot_name,
+            comparisons=core_model_settings.comparisons,
+        )
 
         # batch together fixed probabilities rather than keep hold of the bools
         self.training_fixed_probabilities: set[str] = {
@@ -104,19 +94,16 @@ class EMTrainingSession:
         }
 
         # Remove comparison columns which are either 'used up' by the blocking rules
-
-
-
-
-
-
-
-
-
-
-            cc_cols = [c.input_name for c in cc_cols]
-            if set(br_cols).intersection(cc_cols):
-                comparisons_to_deactivate.append(cc)
+        comparisons_to_deactivate = []
+        br_cols = get_columns_used_from_sql(
+            blocking_rule_for_training.blocking_rule_sql,
+            self.db_api.sql_dialect.sqlglot_name,
+        )
+        for cc in core_model_settings.comparisons:
+            cc_cols = cc._input_columns_used_by_case_statement
+            cc_cols = [c.input_name for c in cc_cols]
+            if set(br_cols).intersection(cc_cols):
+                comparisons_to_deactivate.append(cc)
         cc_names_to_deactivate = [
             cc.output_column_name for cc in comparisons_to_deactivate
         ]
```
```diff
--- splink-4.0.0.dev8/splink/internals/linker.py
+++ splink-4.0.0.dev9/splink/internals/linker.py
@@ -74,7 +74,7 @@ class Linker:
         self,
         input_table_or_tables: str | list[str],
         settings: SettingsCreator | dict[str, Any] | Path | str,
-
+        db_api: DatabaseAPISubClass,
         set_up_basic_logging: bool = True,
         input_table_aliases: str | list[str] | None = None,
         validate_settings: bool = True,
@@ -112,10 +112,12 @@ class Linker:
                 database) for link_only or link_and_dedupe. For some linkers, such as
                 the DuckDBLinker and the SparkLinker, it's also possible to pass in
                 dataframes (Pandas and Spark respectively) rather than strings.
-            settings_dict (dict | Path
-                path
-
-
+            settings_dict (dict | Path | str): A Splink settings dictionary,
+                or a path (either as a pathlib.Path object, or a string) to a json file
+                defining a settings dictionary or pre-trained model.
+            db_api (DatabaseAPI): A `DatabaseAPI` object, which manages interactions
+                with the database. You can import these for use from
+                `splink.backends.{your_backend}`
             set_up_basic_logging (bool, optional): If true, sets ups up basic logging
                 so that Splink sends messages at INFO level to stdout. Defaults to True.
             input_table_aliases (Union[str, list], optional): Labels assigned to
@@ -133,7 +135,7 @@ class Linker:
         splink_logger = logging.getLogger("splink")
         splink_logger.setLevel(logging.INFO)
 
-        self._db_api =
+        self._db_api = db_api
 
         # TODO: temp hack for compat
         self._intermediate_table_cache: CacheDictWithLogging = (
@@ -154,9 +156,7 @@ class Linker:
         # or overwrite it with the db api dialect?
         # Maybe overwrite it here and incompatibilities have to be dealt with
         # by comparisons/ blocking rules etc??
-        self._settings_obj = settings_creator.get_settings(
-            database_api.sql_dialect.name
-        )
+        self._settings_obj = settings_creator.get_settings(db_api.sql_dialect.name)
 
         # TODO: Add test of what happens if the db_api is for a different backend
         # to the sql_dialect set in the settings dict
```