PyPI - splink - Versions diffs - 4.0.0.dev5__tar.gz → 4.0.0.dev6__tar.gz - Mend

splink 4.0.0.dev5__tar.gz → 4.0.0.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (154) hide show

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: splink
-Version: 4.0.0.dev5
+Version: 4.0.0.dev6
 Summary: Fast probabilistic data linkage at scale
 Home-page: https://github.com/moj-analytical-services/splink
 License: MIT

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "4.0.0.dev5"
+version = "4.0.0.dev6"
 description = "Fast probabilistic data linkage at scale"
 authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
 license = "MIT"

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/__init__.py RENAMED Viewed

@@ -44,7 +44,7 @@ def __getattr__(name):
     raise AttributeError(f"module 'splink' has no attribute '{name}'") from None
-__version__ = "4.0.0.dev5"
+__version__ = "4.0.0.dev6"
 __all__ = [

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/accuracy.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 from copy import deepcopy
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 from splink.internals.block_from_labels import block_from_labels
 from splink.internals.blocking import BlockingRule
@@ -307,8 +307,11 @@ def _select_found_by_blocking_rules(linker: "Linker") -> str:
 def truth_space_table_from_labels_table(
-    linker, labels_tablename, threshold_actual=0.5, match_weight_round_to_nearest=None
-):
+    linker: Linker,
+    labels_tablename: str,
+    threshold_actual: float = 0.5,
+    match_weight_round_to_nearest: Optional[float] = None,
+) -> SplinkDataFrame:
     pipeline = CTEPipeline()
     nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
@@ -323,7 +326,7 @@ def truth_space_table_from_labels_table(
     )
     pipeline.enqueue_list_of_sqls(sqls)
-    df_truth_space_table = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_truth_space_table
@@ -356,7 +359,7 @@ def truth_space_table_from_labels_column(
     """
     pipeline.enqueue_sql(sql, "__splink__cartesian_product")
-    cartesian_count = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    cartesian_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     row_count_df = cartesian_count.as_record_dict()
     cartesian_count.drop_table_from_database_and_remove_from_cache()
@@ -393,7 +396,7 @@ def truth_space_table_from_labels_column(
     )
     pipeline.enqueue_list_of_sqls(sqls)
-    df_truth_space_table = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_truth_space_table = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_truth_space_table
@@ -439,12 +442,12 @@ def predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename):
 def prediction_errors_from_labels_table(
-    linker,
-    labels_tablename,
-    include_false_positives=True,
-    include_false_negatives=True,
-    threshold=0.5,
-):
+    linker: Linker,
+    labels_tablename: str,
+    include_false_positives: bool = True,
+    include_false_negatives: bool = True,
+    threshold: float = 0.5,
+) -> SplinkDataFrame:
     pipeline = CTEPipeline()
     nodes_with_tf = compute_df_concat_with_tf(linker, pipeline)
     pipeline = CTEPipeline([nodes_with_tf])
@@ -486,7 +489,7 @@ def prediction_errors_from_labels_table(
     pipeline.enqueue_sql(sql, "__splink__labels_with_fp_fn_status")
-    return linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
 def _predict_from_label_column_sql(linker, label_colname):
@@ -509,18 +512,18 @@ def _predict_from_label_column_sql(linker, label_colname):
         settings._additional_column_names_to_retain.append(label_colname)
     # Now we want to create predictions
-    df_predict = linker.predict()
+    df_predict = linker.inference.predict()
     return df_predict
 def prediction_errors_from_label_column(
-    linker,
-    label_colname,
-    include_false_positives=True,
-    include_false_negatives=True,
-    threshold=0.5,
-):
+    linker: Linker,
+    label_colname: str,
+    include_false_positives: bool = True,
+    include_false_negatives: bool = True,
+    threshold: float = 0.5,
+) -> SplinkDataFrame:
     df_predict = _predict_from_label_column_sql(
         linker,
         label_colname,
@@ -577,6 +580,6 @@ def prediction_errors_from_label_column(
     pipeline.enqueue_sql(sql, "__splink__predictions_from_label_column_fp_fn_only")
-    predictions = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    predictions = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return predictions

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/cluster_studio.py RENAMED Viewed

@@ -63,7 +63,7 @@ def df_clusters_as_records(
     sql = _clusters_sql(df_clustered_nodes, cluster_ids)
     pipeline = CTEPipeline()
     pipeline.enqueue_sql(sql, "__splink__scs_clusters")
-    df_clusters = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_clusters = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_clusters.as_record_dict()
@@ -107,7 +107,7 @@ def create_df_nodes(
     pipeline = CTEPipeline()
     sql = _nodes_sql(df_clustered_nodes, cluster_ids)
     pipeline.enqueue_sql(sql, "__splink__scs_nodes")
-    df_nodes = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_nodes = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_nodes
@@ -151,7 +151,7 @@ def df_edges_as_records(
     sql = _edges_sql(linker, df_predicted_edges, df_nodes)
     pipeline = CTEPipeline()
     pipeline.enqueue_sql(sql, "__splink__scs_edges")
-    df_edges = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_edges.as_record_dict()
@@ -168,7 +168,7 @@ def _get_random_cluster_ids(
     """
     pipeline = CTEPipeline()
     pipeline.enqueue_sql(sql, "__splink__cluster_count")
-    df_cluster_count = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_cluster_count = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     cluster_count = df_cluster_count.as_record_dict()[0]["count"]
     df_cluster_count.drop_table_from_database_and_remove_from_cache()
@@ -192,7 +192,7 @@ def _get_random_cluster_ids(
     """
     pipeline = CTEPipeline()
     pipeline.enqueue_sql(sql, "__splink__df_concat_with_tf_sample")
-    df_sample = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_sample = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return [r["cluster_id"] for r in df_sample.as_record_dict()]
@@ -234,7 +234,7 @@ def _get_cluster_id_of_each_size(
     """
     pipeline.enqueue_sql(sql, "__splink__cluster_count_row_numbered")
-    df_cluster_sample_with_size = linker.db_api.sql_pipeline_to_splink_dataframe(
+    df_cluster_sample_with_size = linker._db_api.sql_pipeline_to_splink_dataframe(
         pipeline
     )
@@ -285,7 +285,7 @@ def _get_lowest_density_clusters(
     """
     pipeline.enqueue_sql(sql, "__splink__lowest_density_clusters")
-    df_lowest_density_clusters = linker.db_api.sql_pipeline_to_splink_dataframe(
+    df_lowest_density_clusters = linker._db_api.sql_pipeline_to_splink_dataframe(
         pipeline
     )

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/connected_components.py RENAMED Viewed

@@ -355,7 +355,7 @@ def _cc_create_unique_id_cols(
     """
     pipeline = CTEPipeline()
     pipeline.enqueue_sql(sql, "__splink__df_connected_components_df")
-    return linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    return linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
 def _exit_query(
@@ -453,7 +453,7 @@ def solve_connected_components(
     pipeline.enqueue_sql(sql, "nodes")
     sql = _cc_generate_neighbours_representation()
     pipeline.enqueue_sql(sql, "__splink__df_neighbours")
-    neighbours = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    neighbours = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     # Create our initial representatives table
     pipeline = CTEPipeline([neighbours])
@@ -465,7 +465,7 @@ def solve_connected_components(
     # Execute if we have no batching, otherwise add it to our batched process
     pipeline.enqueue_sql(sql, "__splink__df_representatives")
-    representatives = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     prev_representatives_table = representatives
     # Loop while our representative table still has unsettled nodes
@@ -500,7 +500,7 @@ def solve_connected_components(
             repr_name,
         )
-        representatives = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+        representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
         pipeline = CTEPipeline()
         # Update table reference
@@ -512,7 +512,7 @@ def solve_connected_components(
         pipeline.enqueue_sql(sql, "__splink__df_root_rows")
-        root_rows_df = linker.db_api.sql_pipeline_to_splink_dataframe(
+        root_rows_df = linker._db_api.sql_pipeline_to_splink_dataframe(
             pipeline, use_cache=False
         )
@@ -540,6 +540,6 @@ def solve_connected_components(
     )
     pipeline = CTEPipeline([representatives])
     pipeline.enqueue_sql(exit_query, "__splink__df_representatives")
-    representatives = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    representatives = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return representatives

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/edge_metrics.py RENAMED Viewed

@@ -68,7 +68,7 @@ def compute_basic_edge_metrics(
     )
     pipeline.enqueue_sql(**sql_info)
-    df_truncated_edges = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_truncated_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_truncated_edges
@@ -96,13 +96,13 @@ def compute_igraph_metrics(
     # this is how igraph deals with nodes
     sql_infos = _node_mapping_table_sql(df_node_metrics)
     pipeline.enqueue_list_of_sqls(sql_infos)
-    df_node_mappings = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_node_mappings = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     # we keep only edges at or above relevant threshold
     pipeline = CTEPipeline()
     sql_info = _truncated_edges_sql(df_predict, threshold_match_probability)
     pipeline.enqueue_sql(**sql_info)
-    df_truncated_edges = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_truncated_edges = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     # we map the truncated edges to the integer encoding for nodes above,
     # keeping only the list of endpoints
@@ -114,7 +114,7 @@ def compute_igraph_metrics(
         composite_uid_edges_r,
     )
     pipeline.enqueue_sql(**sql_info)
-    edges_for_igraph = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    edges_for_igraph = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     # we will need to manually register a table, so we use the hash from this table
     igraph_edges_hash = edges_for_igraph.physical_name[-9:]
     # NB: for large data we may have to revise this and process in chunks
@@ -124,7 +124,7 @@ def compute_igraph_metrics(
     igraph_df = ig.Graph.DataFrame(df_edges_for_igraph, directed=False)
     bridges_indices = igraph_df.bridges()
     df_bridges_pd = df_edges_for_igraph.iloc[bridges_indices, :]
-    df_bridges = linker.register_table(
+    df_bridges = linker.table_management.register_table(
         df_bridges_pd, f"__splink__bridges_{igraph_edges_hash}"
     )
     # map our bridge edges back to the original node labelling
@@ -139,5 +139,5 @@ def compute_igraph_metrics(
         composite_uid_edges_r,
     )
     pipeline.enqueue_sql(**sql_info)
-    df_edge_metrics = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    df_edge_metrics = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
     return df_edge_metrics

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/estimate_u.py RENAMED Viewed

@@ -74,7 +74,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non
     settings_obj._retain_matching_columns = False
     settings_obj._retain_intermediate_calculation_columns = False
-    db_api = training_linker.db_api
+    db_api = training_linker._db_api
     for cc in settings_obj.comparisons:
         for cl in cc.comparison_levels:
@@ -211,6 +211,7 @@ def estimate_u_values(linker: Linker, max_pairs: float, seed: int = None) -> Non
     ]
     m_u_records_lookup = m_u_records_to_lookup_dict(m_u_records)
     for c in original_settings_obj.comparisons:
         for cl in c._comparison_levels_excluding_null:
             append_u_probability_to_comparison_level_trained_probabilities(

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/find_brs_with_comparison_counts_below_threshold.py RENAMED Viewed

@@ -158,13 +158,13 @@ def _search_tree_for_blocking_rules_below_threshold_count(
     if len(current_combination) == len(all_columns):
         return results  # All fields included, meaning we're at a leaf so exit recursion
-    br = _generate_blocking_rule(linker.db_api, current_combination)
+    br = _generate_blocking_rule(linker._db_api, current_combination)
     comparison_count = _count_comparisons_generated_from_blocking_rule(
         splink_df_dict=linker._input_tables_dict,
         blocking_rule=br,
         link_type=linker._settings_obj._link_type,
-        db_api=linker.db_api,
+        db_api=linker._db_api,
         compute_post_filter_count=False,
         source_dataset_input_column=linker._settings_obj.column_info_settings.source_dataset_input_column,
         unique_id_input_column=linker._settings_obj.column_info_settings.unique_id_input_column,

{splink-4.0.0.dev5 → splink-4.0.0.dev6}/splink/internals/labelling_tool.py RENAMED Viewed

@@ -50,9 +50,9 @@ def generate_labelling_tool_comparisons(
     """
     pipeline.enqueue_sql(sql, "__splink__df_labelling_tool_record")
-    splink_df = linker.db_api.sql_pipeline_to_splink_dataframe(pipeline)
+    splink_df = linker._db_api.sql_pipeline_to_splink_dataframe(pipeline)
-    matches = linker.find_matches_to_new_records(
+    matches = linker.inference.find_matches_to_new_records(
         splink_df.physical_name, match_weight_threshold=match_weight_threshold
     )

splink 4.0.0.dev5__tar.gz → 4.0.0.dev6__tar.gz

splink 4.0.0.dev5__tar.gz → 4.0.0.dev6__tar.gz