splink 4.0.0.dev6__tar.gz → 4.0.0.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/PKG-INFO +48 -44
  2. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/README.md +47 -42
  3. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/pyproject.toml +1 -2
  4. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/__init__.py +1 -1
  5. splink-4.0.0.dev8/splink/backends/athena.py +3 -0
  6. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/blocking_analysis.py +2 -0
  7. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/comparison_library.py +10 -0
  8. splink-4.0.0.dev8/splink/exploratory.py +5 -0
  9. splink-4.0.0.dev8/splink/internals/athena/database_api.py +266 -0
  10. splink-4.0.0.dev8/splink/internals/athena/dataframe.py +119 -0
  11. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/blocking.py +30 -31
  12. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/blocking_analysis.py +104 -22
  13. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/blocking_rule_library.py +29 -2
  14. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison.py +2 -1
  15. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_creator.py +13 -8
  16. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_level_composition.py +0 -1
  17. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_level_library.py +1 -1
  18. splink-4.0.0.dev8/splink/internals/comparison_library.py +1120 -0
  19. splink-4.0.0.dev8/splink/internals/comparison_vector_values.py +96 -0
  20. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/completeness.py +0 -3
  21. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/connected_components.py +8 -42
  22. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/database_api.py +13 -2
  23. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/databricks/enable_splink.py +16 -18
  24. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/dialects.py +21 -10
  25. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/duckdb/dataframe.py +3 -1
  26. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/em_training_session.py +36 -22
  27. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/estimate_u.py +11 -9
  28. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/find_brs_with_comparison_counts_below_threshold.py +18 -1
  29. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/find_matches_to_new_records.py +16 -5
  30. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/input_column.py +5 -5
  31. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker.py +15 -8
  32. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/clustering.py +22 -28
  33. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/evaluation.py +14 -10
  34. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/inference.py +183 -82
  35. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/misc.py +4 -0
  36. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/table_management.py +19 -8
  37. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/training.py +79 -47
  38. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/linker_components/visualisations.py +92 -40
  39. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/m_training.py +13 -5
  40. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/predict.py +1 -6
  41. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/profile_data.py +1 -1
  42. splink-4.0.0.dev6/splink/internals/comparison_helpers.py → splink-4.0.0.dev8/splink/internals/similarity_analysis.py +39 -21
  43. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/spark/database_api.py +9 -0
  44. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/splink_dataframe.py +8 -8
  45. splink-4.0.0.dev8/splink/internals/sqlite/__init__.py +0 -0
  46. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/term_frequencies.py +3 -1
  47. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/unlinkables.py +1 -1
  48. splink-4.0.0.dev6/splink/comparison_template_library.py +0 -15
  49. splink-4.0.0.dev6/splink/exploratory.py +0 -4
  50. splink-4.0.0.dev6/splink/internals/athena/linker.py +0 -563
  51. splink-4.0.0.dev6/splink/internals/comparison_library.py +0 -646
  52. splink-4.0.0.dev6/splink/internals/comparison_template_library.py +0 -666
  53. splink-4.0.0.dev6/splink/internals/comparison_vector_values.py +0 -30
  54. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/LICENSE +0 -0
  55. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/backends/spark.py +0 -0
  56. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/backends/sqlite.py +0 -0
  57. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/comparison_level_library.py +0 -0
  58. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/datasets.py +0 -0
  59. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/__init__.py +0 -0
  60. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/accuracy.py +0 -0
  61. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/athena/__init__.py +0 -0
  62. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/athena/athena_helpers/__init__.py +0 -0
  63. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/athena/athena_helpers/athena_transforms.py +0 -0
  64. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/athena/athena_helpers/athena_utils.py +0 -0
  65. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/block_from_labels.py +0 -0
  66. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/blocking_rule_creator.py +0 -0
  67. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/blocking_rule_creator_utils.py +0 -0
  68. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/cache_dict_with_logging.py +0 -0
  69. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/charts.py +0 -0
  70. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/cluster_studio.py +0 -0
  71. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/column_expression.py +0 -0
  72. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_level.py +0 -0
  73. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_level_creator.py +0 -0
  74. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_level_sql.py +0 -0
  75. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/comparison_vector_distribution.py +0 -0
  76. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/constants.py +0 -0
  77. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/cost_of_blocking_rules.py +0 -0
  78. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/databricks/__init__.py +0 -0
  79. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/datasets/__init__.py +0 -0
  80. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/default_from_jsonschema.py +0 -0
  81. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/duckdb/__init__.py +0 -0
  82. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/duckdb/database_api.py +0 -0
  83. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/duckdb/duckdb_helpers/__init__.py +0 -0
  84. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/duckdb/duckdb_helpers/duckdb_helpers.py +0 -0
  85. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/edge_metrics.py +0 -0
  86. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/exceptions.py +0 -0
  87. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/expectation_maximisation.py +0 -0
  88. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/DEPENDENCY_LICENSES.txt +0 -0
  89. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/accuracy_chart.json +0 -0
  90. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/blocking_rule_generated_comparisons.json +0 -0
  91. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/comparator_score_chart.json +0 -0
  92. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/comparator_score_threshold_chart.json +0 -0
  93. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/completeness.json +0 -0
  94. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/m_u_parameters_interactive_history.json +0 -0
  95. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/match_weight_histogram.json +0 -0
  96. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/match_weights_interactive_history.json +0 -0
  97. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/match_weights_waterfall.json +0 -0
  98. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/missingness.json +0 -0
  99. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/parameter_estimate_comparisons.json +0 -0
  100. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/phonetic_match_chart.json +0 -0
  101. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/precision_recall.json +0 -0
  102. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/probability_two_random_records_match_iteration.json +0 -0
  103. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/profile_data.json +0 -0
  104. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/roc.json +0 -0
  105. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/tf_adjustment_chart.json +0 -0
  106. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/threshold_selection_tool.json +0 -0
  107. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/chart_defs/unlinkables_chart_def.json +0 -0
  108. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/external_js/d3@7.8.5 +0 -0
  109. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/external_js/stdlib.js@5.8.3 +0 -0
  110. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/external_js/vega-embed@6.20.2 +0 -0
  111. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/external_js/vega-lite@5.2.0 +0 -0
  112. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/external_js/vega@5.21.0 +0 -0
  113. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/labelling_tool/slt.js +0 -0
  114. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/labelling_tool/template.j2 +0 -0
  115. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/settings_jsonschema.json +0 -0
  116. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_classic.jar +0 -0
  117. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.0_spark3.3.jar +0 -0
  118. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/spark_jars/scala-udf-similarity-0.1.1_spark3.x.jar +0 -0
  119. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/splink_cluster_studio/cluster_template.j2 +0 -0
  120. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/splink_cluster_studio/custom.css +0 -0
  121. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/splink_comparison_viewer/custom.css +0 -0
  122. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/splink_comparison_viewer/template.j2 +0 -0
  123. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/splink_vis_utils/splink_vis_utils.js +0 -0
  124. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/files/templates/single_chart_template.html +0 -0
  125. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/graph_metrics.py +0 -0
  126. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/labelling_tool.py +0 -0
  127. {splink-4.0.0.dev6/splink/internals/postgres → splink-4.0.0.dev8/splink/internals/linker_components}/__init__.py +0 -0
  128. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/logging_messages.py +0 -0
  129. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/lower_id_on_lhs.py +0 -0
  130. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/m_from_labels.py +0 -0
  131. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/m_u_records_to_parameters.py +0 -0
  132. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/match_key_analysis.py +0 -0
  133. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/match_weights_histogram.py +0 -0
  134. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/misc.py +0 -0
  135. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/optimise_cost_of_brs.py +0 -0
  136. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/parse_sql.py +0 -0
  137. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/pipeline.py +0 -0
  138. {splink-4.0.0.dev6/splink/internals/settings_validation → splink-4.0.0.dev8/splink/internals/postgres}/__init__.py +0 -0
  139. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/postgres/database_api.py +0 -0
  140. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/postgres/dataframe.py +0 -0
  141. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/settings.py +0 -0
  142. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/settings_creator.py +0 -0
  143. {splink-4.0.0.dev6/splink/internals/spark → splink-4.0.0.dev8/splink/internals/settings_validation}/__init__.py +0 -0
  144. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/settings_validation/log_invalid_columns.py +0 -0
  145. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/settings_validation/settings_column_cleaner.py +0 -0
  146. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/settings_validation/settings_validation_log_strings.py +0 -0
  147. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/settings_validation/valid_types.py +0 -0
  148. {splink-4.0.0.dev6/splink/internals/spark/spark_helpers → splink-4.0.0.dev8/splink/internals/spark}/__init__.py +0 -0
  149. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/spark/dataframe.py +0 -0
  150. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/spark/jar_location.py +0 -0
  151. {splink-4.0.0.dev6/splink/internals/sqlite → splink-4.0.0.dev8/splink/internals/spark/spark_helpers}/__init__.py +0 -0
  152. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/spark/spark_helpers/custom_spark_dialect.py +0 -0
  153. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/splink_comparison_viewer.py +0 -0
  154. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/sql_transform.py +0 -0
  155. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/sqlite/database_api.py +0 -0
  156. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/sqlite/dataframe.py +0 -0
  157. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/unique_id_concat.py +0 -0
  158. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/validate_jsonschema.py +0 -0
  159. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/vertically_concatenate.py +0 -0
  160. {splink-4.0.0.dev6 → splink-4.0.0.dev8}/splink/internals/waterfall_chart.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: splink
- Version: 4.0.0.dev6
+ Version: 4.0.0.dev8
  Summary: Fast probabilistic data linkage at scale
  Home-page: https://github.com/moj-analytical-services/splink
  License: MIT
@@ -27,7 +27,6 @@ Requires-Dist: jsonschema (>=3.2)
  Requires-Dist: numpy (>=1.17.3) ; python_version < "3.12"
  Requires-Dist: numpy (>=1.26.0) ; python_version >= "3.12"
  Requires-Dist: pandas (>1.3.5)
- Requires-Dist: phonetics (>=1.0.5)
  Requires-Dist: psycopg2-binary (>=2.8.0) ; extra == "postgres"
  Requires-Dist: pyspark (>=3.2.1) ; extra == "pyspark" or extra == "spark"
  Requires-Dist: sqlglot (>=13.0.0)
@@ -51,11 +50,11 @@ Splink is a Python package for probabilistic record linkage (entity resolution)

  ## Key Features

- ⚡ **Speed:** Capable of linking a million records on a laptop in around a minute.
- 🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic.
- 🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records.
- 🎓 **Unsupervised Learning:** No training data is required for model training.
- 📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.
+ ⚡ **Speed:** Capable of linking a million records on a laptop in around a minute.<br>
+ 🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic.<br>
+ 🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records.<br>
+ 🎓 **Unsupervised Learning:** No training data is required for model training.<br>
+ 📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.<br>

  Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.

@@ -75,19 +74,16 @@ and clusters these links to produce an estimated person ID:

  ## What data does Splink work best with?

- Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
-
  Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.

- High correlation occurs when the value of a column is highly constrained (predictable) from the value of another column. For example, a 'city' field is almost perfectly correlated with 'postcode'. Gender is highly correlated with 'first name'. Correlation is particularly problematic if **all** of your input columns are highly correlated.
+ High correlation occurs when one column is highly predictable from another - for instance, city can be predicted from postcode. Correlation is particularly problematic if **all** of your input columns are highly correlated.

  Splink is not designed for linking a single column containing a 'bag of words'. For example, a table with a single 'company name' column, and no other details.

  ## Documentation

- The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/). Interactive demos can be found [here](https://github.com/moj-analytical-services/splink/tree/master/docs/demos), or by clicking the following Binder link:
+ The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/), including a [tutorial](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html) and [examples](https://moj-analytical-services.github.io/splink/demos/examples/examples_index.html) that can be run in the browser.

- [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/moj-analytical-services/splink/binder_branch?labpath=docs%2Fdemos%2Ftutorials%2F00_Tutorial_Introduction.ipynb)

  The specification of the Fellegi Sunter statistical model behind `splink` is similar as that used in the R [fastLink package](https://github.com/kosukeimai/fastLink). Accompanying the fastLink package is an [academic paper](http://imai.fas.harvard.edu/research/files/linkage.pdf) that describes this model. The [Splink documentation site](https://moj-analytical-services.github.io/splink/topic_guides/fellegi_sunter.html) and a [series of interactive articles](https://www.robinlinacre.com/probabilistic_linkage/) also explores the theory behind Splink.

@@ -143,43 +139,56 @@ The following code demonstrates how to estimate the parameters of a deduplicatio
  For more detailed tutorial, please see [here](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html).

  ```py
- from splink.duckdb.linker import DuckDBLinker
- import splink.duckdb.comparison_library as cl
- import splink.duckdb.comparison_template_library as ctl
- from splink.duckdb.blocking_rule_library import block_on
- from splink.datasets import splink_datasets
+ import splink.comparison_library as cl
+ import splink.comparison_template_library as ctl
+ from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
+
+ db_api = DuckDBAPI()

  df = splink_datasets.fake_1000

- settings = {
-     "link_type": "dedupe_only",
-     "blocking_rules_to_generate_predictions": [
+ settings = SettingsCreator(
+     link_type="dedupe_only",
+     comparisons=[
+         cl.JaroWinklerAtThresholds("first_name", [0.9, 0.7]),
+         cl.JaroAtThresholds("surname", [0.9, 0.7]),
+         ctl.DateComparison(
+             "dob",
+             input_is_string=True,
+             datetime_metrics=["year", "month"],
+             datetime_thresholds=[1, 1],
+         ),
+         cl.ExactMatch("city").configure(term_frequency_adjustments=True),
+         ctl.EmailComparison("email"),
+     ],
+     blocking_rules_to_generate_predictions=[
          block_on("first_name"),
          block_on("surname"),
-     ],
-     "comparisons": [
-         ctl.name_comparison("first_name"),
-         ctl.name_comparison("surname"),
-         ctl.date_comparison("dob", cast_strings_to_date=True),
-         cl.exact_match("city", term_frequency_adjustments=True),
-         ctl.email_comparison("email", include_username_fuzzy_level=False),
-     ],
- }
+     ]
+ )
+
+ linker = Linker(df, settings, db_api)
+
+ linker.training.estimate_probability_two_random_records_match(
+     [block_on("first_name", "surname")],
+     recall=0.7,
+ )

- linker = DuckDBLinker(df, settings)
- linker.estimate_u_using_random_sampling(max_pairs=1e6)
+ linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

- blocking_rule_for_training = block_on(["first_name", "surname"])
+ linker.training.estimate_parameters_using_expectation_maximisation(
+     block_on("first_name", "surname")
+ )

- linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
+ linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

- blocking_rule_for_training = block_on("dob")
- linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
+ pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)

- pairwise_predictions = linker.predict()
+ clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
+     pairwise_predictions, 0.95
+ )

- clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
- clusters.as_pandas_dataframe(limit=5)
+ df_clusters = clusters.as_pandas_dataframe(limit=5)
  ```

  ## Videos
@@ -187,13 +196,10 @@ clusters.as_pandas_dataframe(limit=5)
  - [A introductory presentation on Splink](https://www.youtube.com/watch?v=msz3T741KQI)
  - [An introduction to the Splink Comparison Viewer dashboard](https://www.youtube.com/watch?v=DNvCMqjipis)

- ## Charts Gallery
-
- You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).

  ## Support

- To find the best place to ask a question, report a bug or get general advice, please refer to our [Contributing Guide](./CONTRIBUTING.md).
+ To find the best place to ask a question, report a bug or get general advice, please refer to our [Guide](./CONTRIBUTING.md).

  ## Use Cases

@@ -201,8 +207,6 @@ To see how users are using Splink in the wild, check out the [Use Cases](https:/

  ## Awards

- ❓ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
-
  🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)

  🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
@@ -15,11 +15,11 @@ Splink is a Python package for probabilistic record linkage (entity resolution)

  ## Key Features

- ⚡ **Speed:** Capable of linking a million records on a laptop in around a minute.
- 🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic.
- 🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records.
- 🎓 **Unsupervised Learning:** No training data is required for model training.
- 📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.
+ ⚡ **Speed:** Capable of linking a million records on a laptop in around a minute.<br>
+ 🎯 **Accuracy:** Support for term frequency adjustments and user-defined fuzzy matching logic.<br>
+ 🌐 **Scalability:** Execute linkage in Python (using DuckDB) or big-data backends like AWS Athena or Spark for 100+ million records.<br>
+ 🎓 **Unsupervised Learning:** No training data is required for model training.<br>
+ 📊 **Interactive Outputs:** A suite of interactive visualisations help users understand their model and diagnose problems.<br>

  Splink's linkage algorithm is based on Fellegi-Sunter's model of record linkage, with various customisations to improve accuracy.

@@ -39,19 +39,16 @@ and clusters these links to produce an estimated person ID:

  ## What data does Splink work best with?

- Before using Splink, input data should be standardised, with consistent column names and formatting (e.g., lowercased, punctuation cleaned up, etc.).
-
  Splink performs best with input data containing **multiple** columns that are **not highly correlated**. For instance, if the entity type is persons, you may have columns for full name, date of birth, and city. If the entity type is companies, you could have columns for name, turnover, sector, and telephone number.

- High correlation occurs when the value of a column is highly constrained (predictable) from the value of another column. For example, a 'city' field is almost perfectly correlated with 'postcode'. Gender is highly correlated with 'first name'. Correlation is particularly problematic if **all** of your input columns are highly correlated.
+ High correlation occurs when one column is highly predictable from another - for instance, city can be predicted from postcode. Correlation is particularly problematic if **all** of your input columns are highly correlated.

  Splink is not designed for linking a single column containing a 'bag of words'. For example, a table with a single 'company name' column, and no other details.

  ## Documentation

- The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/). Interactive demos can be found [here](https://github.com/moj-analytical-services/splink/tree/master/docs/demos), or by clicking the following Binder link:
+ The homepage for the Splink documentation can be found [here](https://moj-analytical-services.github.io/splink/), including a [tutorial](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html) and [examples](https://moj-analytical-services.github.io/splink/demos/examples/examples_index.html) that can be run in the browser.

- [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/moj-analytical-services/splink/binder_branch?labpath=docs%2Fdemos%2Ftutorials%2F00_Tutorial_Introduction.ipynb)

  The specification of the Fellegi Sunter statistical model behind `splink` is similar as that used in the R [fastLink package](https://github.com/kosukeimai/fastLink). Accompanying the fastLink package is an [academic paper](http://imai.fas.harvard.edu/research/files/linkage.pdf) that describes this model. The [Splink documentation site](https://moj-analytical-services.github.io/splink/topic_guides/fellegi_sunter.html) and a [series of interactive articles](https://www.robinlinacre.com/probabilistic_linkage/) also explores the theory behind Splink.

@@ -107,43 +104,56 @@ The following code demonstrates how to estimate the parameters of a deduplicatio
  For more detailed tutorial, please see [here](https://moj-analytical-services.github.io/splink/demos/tutorials/00_Tutorial_Introduction.html).

  ```py
- from splink.duckdb.linker import DuckDBLinker
- import splink.duckdb.comparison_library as cl
- import splink.duckdb.comparison_template_library as ctl
- from splink.duckdb.blocking_rule_library import block_on
- from splink.datasets import splink_datasets
+ import splink.comparison_library as cl
+ import splink.comparison_template_library as ctl
+ from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
+
+ db_api = DuckDBAPI()

  df = splink_datasets.fake_1000

- settings = {
-     "link_type": "dedupe_only",
-     "blocking_rules_to_generate_predictions": [
+ settings = SettingsCreator(
+     link_type="dedupe_only",
+     comparisons=[
+         cl.JaroWinklerAtThresholds("first_name", [0.9, 0.7]),
+         cl.JaroAtThresholds("surname", [0.9, 0.7]),
+         ctl.DateComparison(
+             "dob",
+             input_is_string=True,
+             datetime_metrics=["year", "month"],
+             datetime_thresholds=[1, 1],
+         ),
+         cl.ExactMatch("city").configure(term_frequency_adjustments=True),
+         ctl.EmailComparison("email"),
+     ],
+     blocking_rules_to_generate_predictions=[
          block_on("first_name"),
          block_on("surname"),
-     ],
-     "comparisons": [
-         ctl.name_comparison("first_name"),
-         ctl.name_comparison("surname"),
-         ctl.date_comparison("dob", cast_strings_to_date=True),
-         cl.exact_match("city", term_frequency_adjustments=True),
-         ctl.email_comparison("email", include_username_fuzzy_level=False),
-     ],
- }
+     ]
+ )
+
+ linker = Linker(df, settings, db_api)
+
+ linker.training.estimate_probability_two_random_records_match(
+     [block_on("first_name", "surname")],
+     recall=0.7,
+ )

- linker = DuckDBLinker(df, settings)
- linker.estimate_u_using_random_sampling(max_pairs=1e6)
+ linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

- blocking_rule_for_training = block_on(["first_name", "surname"])
+ linker.training.estimate_parameters_using_expectation_maximisation(
+     block_on("first_name", "surname")
+ )

- linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
+ linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob"))

- blocking_rule_for_training = block_on("dob")
- linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
+ pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)

- pairwise_predictions = linker.predict()
+ clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
+     pairwise_predictions, 0.95
+ )

- clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
- clusters.as_pandas_dataframe(limit=5)
+ df_clusters = clusters.as_pandas_dataframe(limit=5)
  ```

  ## Videos
@@ -151,13 +161,10 @@ clusters.as_pandas_dataframe(limit=5)
  - [A introductory presentation on Splink](https://www.youtube.com/watch?v=msz3T741KQI)
  - [An introduction to the Splink Comparison Viewer dashboard](https://www.youtube.com/watch?v=DNvCMqjipis)

- ## Charts Gallery
-
- You can see all of the interactive charts provided in Splink by checking out the [Charts Gallery](https://moj-analytical-services.github.io/splink/charts/index.html).

  ## Support

- To find the best place to ask a question, report a bug or get general advice, please refer to our [Contributing Guide](./CONTRIBUTING.md).
+ To find the best place to ask a question, report a bug or get general advice, please refer to our [Guide](./CONTRIBUTING.md).

  ## Use Cases

@@ -165,8 +172,6 @@ To see how users are using Splink in the wild, check out the [Use Cases](https:/

  ## Awards

- ❓ Future of Government Awards 2023: Open Source Creation - [Shortlisted, result to be announced shortly](https://futureofgovernment.com/en)
-
  🥈 Civil Service Awards 2023: Best Use of Data, Science, and Technology - [Runner up](https://www.civilserviceawards.com/best-use-of-data-science-and-technology-award-2/)

  🥇 Analysis in Government Awards 2022: People's Choice Award - [Winner](https://analysisfunction.civilservice.gov.uk/news/announcing-the-winner-of-the-first-analysis-in-government-peoples-choice-award/)
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "splink"
- version = "4.0.0.dev6"
+ version = "4.0.0.dev8"
  description = "Fast probabilistic data linkage at scale"
  authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth", "Andy Bond", "Ross Kennedy"]
  license = "MIT"
@@ -17,7 +17,6 @@ duckdb = ">=0.9.2"
  sqlglot = ">=13.0.0"
  altair = "^5.0.1"
  Jinja2 = ">=3.0.3"
- phonetics = ">=1.0.5"

  # need to manually specify numpy versions suitable for CI
  # 1.24.4 works with python 3.8, but not 3.12
@@ -44,7 +44,7 @@ def __getattr__(name):
      raise AttributeError(f"module 'splink' has no attribute '{name}'") from None


- __version__ = "4.0.0.dev6"
+ __version__ = "4.0.0.dev8"


  __all__ = [
@@ -0,0 +1,3 @@
+ from splink.internals.athena.database_api import AthenaAPI
+
+ __all__ = ["AthenaAPI"]
@@ -2,10 +2,12 @@ from .internals.blocking_analysis import (
      count_comparisons_from_blocking_rule,
      cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
      cumulative_comparisons_to_be_scored_from_blocking_rules_data,
+     n_largest_blocks,
  )

  __all__ = [
      "count_comparisons_from_blocking_rule",
      "cumulative_comparisons_to_be_scored_from_blocking_rules_chart",
      "cumulative_comparisons_to_be_scored_from_blocking_rules_data",
+     "n_largest_blocks",
  ]
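The newly exported `n_largest_blocks` lets users inspect the biggest blocks a candidate blocking rule would create before committing to a full `predict` run. A minimal sketch of how it might be called; the keyword arguments are assumptions based on the sibling `blocking_analysis` functions, not shown in this diff:

```py
# Sketch only: argument names are assumed from the other
# blocking_analysis functions exported above.
from splink import DuckDBAPI, block_on, splink_datasets
from splink.blocking_analysis import n_largest_blocks

db_api = DuckDBAPI()
df = splink_datasets.fake_1000

# Report the largest blocks a candidate rule would generate
result = n_largest_blocks(
    table_or_tables=df,
    blocking_rule=block_on("city"),
    link_type="dedupe_only",
    db_api=db_api,
    n_largest=5,
)
```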
@@ -4,13 +4,18 @@ from splink.internals.comparison_library import (
      ArrayIntersectAtSizes,
      CustomComparison,
      DamerauLevenshteinAtThresholds,
+     DateOfBirthComparison,
      DistanceFunctionAtThresholds,
      DistanceInKMAtThresholds,
+     EmailComparison,
      ExactMatch,
+     ForenameSurnameComparison,
      JaccardAtThresholds,
      JaroAtThresholds,
      JaroWinklerAtThresholds,
      LevenshteinAtThresholds,
+     NameComparison,
+     PostcodeComparison,
  )

  __all__ = [
@@ -26,4 +31,9 @@ __all__ = [
      "AbsoluteDateDifferenceAtThresholds",
      "ArrayIntersectAtSizes",
      "DistanceInKMAtThresholds",
+     "DateOfBirthComparison",
+     "EmailComparison",
+     "ForenameSurnameComparison",
+     "NameComparison",
+     "PostcodeComparison",
  ]
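These additions make the comparison templates importable directly from `splink.comparison_library`. A short illustrative sketch of the new exports in a settings definition; the column names are hypothetical and the constructor arguments are assumptions rather than content of this diff:

```py
# Sketch only: hypothetical columns, assumed constructor arguments.
import splink.comparison_library as cl

comparisons = [
    cl.NameComparison("first_name"),
    cl.ForenameSurnameComparison("first_name", "surname"),
    cl.DateOfBirthComparison("dob", input_is_string=True),
    cl.PostcodeComparison("postcode"),
    cl.EmailComparison("email"),
]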
@@ -0,0 +1,5 @@
+ from .internals import similarity_analysis
+ from .internals.completeness import completeness_chart
+ from .internals.profile_data import profile_columns
+
+ __all__ = ["completeness_chart", "profile_columns", "similarity_analysis"]
@@ -0,0 +1,266 @@
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ from typing import Any, Sequence
+
+ import awswrangler as wr
+ import boto3
+ import pandas as pd
+
+ from ..database_api import AcceptableInputTableType, DatabaseAPI
+ from ..dialects import AthenaDialect
+ from ..sql_transform import sqlglot_transform_sql
+ from .athena_helpers.athena_transforms import cast_concat_as_varchar
+ from .athena_helpers.athena_utils import (
+     _verify_athena_inputs,
+ )
+ from .dataframe import AthenaDataFrame
+
+ logger = logging.getLogger(__name__)
+
+
+ # Dict because there's not really a 'tablish' type in Athena
+ class AthenaAPI(DatabaseAPI[dict[str, Any]]):
+     sql_dialect = AthenaDialect()
+
+     def __init__(
+         self,
+         boto3_session: boto3.session.Session,
+         output_database: str,
+         output_bucket: str,
+         output_filepath: str = None,
+     ):
+         super().__init__()
+         if not type(boto3_session) == boto3.session.Session:
+             raise ValueError("Please enter a valid boto3 session object.")
+
+         self._sql_dialect_ = "presto"
+
+         _verify_athena_inputs(output_database, output_bucket, boto3_session)
+         self.boto3_session = boto3_session
+         self.output_schema = output_database
+         self.output_bucket = output_bucket
+
+         # If the default folder is blank, name it `splink_warehouse`
+         if output_filepath:
+             self.output_filepath = output_filepath
+         else:
+             self.output_filepath = "splink_warehouse"
+
+         self.ctas_query_info: dict[str, Any] = {}
+
+         # TODO: How to run this check without the input_tables?
+         # Run a quick check against our inputs to check if they
+         # exist in the database
+         # for table in input_tables:
+         #     if not isinstance(table, self.accepted_df_dtypes):
+         #         db, tb = self.get_schema_info(table)
+         #         self._check_table_exists(db, tb)
+
+     @property
+     def s3_output(self):
+         out_path = os.path.join(
+             "s3://",
+             self.output_bucket,
+             self.output_filepath,
+             self._cache_uid,  # added in the super() step
+         )
+         if out_path[-1] != "/":
+             out_path += "/"
+
+         return out_path
+
+     # TODO: Should output_filepath use getters and setters?
+     def change_output_filepath(self, new_filepath):
+         self.output_filepath = new_filepath
+
+     def get_schema_info(self, input_table: str) -> list[str]:
+         t = input_table.split(".")
+         return t if len(t) > 1 else [self.output_schema, input_table]
+
+     def _check_table_exists(self, db: str, tb: str) -> None:
+         # A quick function to check if a table exists
+         # and spit out a warning if it is not found.
+         table_exists = wr.catalog.does_table_exist(
+             database=db,
+             table=tb,
+             boto3_session=self.boto3_session,
+         )
+         if not table_exists:
+             raise wr.exceptions.InvalidTable(
+                 f"Table '{tb}' was not found within your selected "
+                 f"database '{db}'. Please verify your input table "
+                 "exists."
+             )
+
+     def _drop_table_from_database_if_exists(self, table):
+         return wr.catalog.delete_table_if_exists(
+             database=self.output_schema, table=table, boto3_session=self.boto3_session
+         )
+
+     def _delete_table_from_s3(self, physical_name):
+         path = f"{self.s3_output}{physical_name}/"
+         # delete our folder
+         wr.s3.delete_objects(
+             path=path,
+             use_threads=True,
+             boto3_session=self.boto3_session,
+         )
+
+         metadata = self.ctas_query_info[physical_name]
+         if "output_location" in metadata:
+             metadata_urls = [
+                 # metadata output location
+                 f"{metadata['output_location']}.metadata",
+                 # manifest location
+                 metadata["manifest_location"],
+             ]
+             # delete our metadata
+             wr.s3.delete_objects(
+                 path=metadata_urls,
+                 use_threads=True,
+                 boto3_session=self.boto3_session,
+             )
+
+         self.ctas_query_info.pop(physical_name)
+
+     def delete_table_from_database(self, name):
+         if name in self.ctas_query_info:
+             # Use ctas metadata to delete backing data
+             self._delete_table_from_s3(name)
+         else:
+             # If the location we want to write to already exists,
+             # clean this before continuing.
+             loc = f"{self.s3_output}{name}"
+             folder_exists = wr.s3.list_directories(
+                 loc,
+                 boto3_session=self.boto3_session,
+             )
+             if folder_exists:
+                 # This will only delete objects we are required to delete
+                 wr.s3.delete_objects(
+                     path=loc,
+                     use_threads=True,
+                     boto3_session=self.boto3_session,
+                 )
+
+         self._drop_table_from_database_if_exists(name)
+
+     def _register_data_on_s3(self, table, alias):
+         out_loc = f"{self.s3_output}{alias}"
+
+         wr.s3.to_parquet(
+             df=table,
+             path=out_loc,
+             dataset=True,
+             mode="overwrite",
+             database=self.output_schema,
+             table=alias,
+             boto3_session=self.boto3_session,
+             compression="snappy",
+             use_threads=True,
+         )
+         # Construct the ctas metadata that we require
+         ctas_metadata = {
+             "ctas_database": self.output_schema,
+             "ctas_table": alias,
+         }
+         self.ctas_query_info.update({alias: ctas_metadata})
+
+     def _table_registration(self, input, table_name):
+         if isinstance(input, dict):
+             input = pd.DataFrame(input)
+         elif isinstance(input, list):
+             input = pd.DataFrame.from_records(input)
+
+         # Errors if an invalid data type is passed
+         self._register_data_on_s3(input, table_name)
+
+     def table_to_splink_dataframe(self, templated_name, physical_name):
+         return AthenaDataFrame(templated_name, physical_name, self)
+
+     def _create_table(self, sql, physical_name):
+         ctas_metadata = wr.athena.create_ctas_table(
+             sql=sql,
+             database=self.output_schema,
+             ctas_table=physical_name,
+             storage_format="parquet",
+             write_compression="snappy",
+             boto3_session=self.boto3_session,
+             s3_output=self.s3_output,
+             wait=True,
+         )
+         return ctas_metadata
+
+     def table_exists_in_database(self, table_name):
+         return wr.catalog.does_table_exist(
+             database=self.output_schema,
+             table=table_name,
+             boto3_session=self.boto3_session,
+         )
+
+     def _extract_ctas_metadata(self, ctas_metadata):
+         query_meta = ctas_metadata.pop("ctas_query_metadata")
+         out_locs = {
+             "output_location": query_meta.output_location,
+             "manifest_location": query_meta.manifest_location,
+         }
+         ctas_metadata.update(out_locs)
+         return ctas_metadata
+
+     def _setup_for_execute_sql(self, sql: str, physical_name: str) -> str:
+         self.delete_table_from_database(physical_name)
+         # This is a hack because execute_sql_against_backend
+         # needs the physical name but the _execute_sql_against_backend
+         # method just takes a string
+         return json.dumps(
+             {
+                 "physical_name": physical_name,
+                 "sql": sql,
+             }
+         )
+
+     def _execute_sql_against_backend(self, sql):
+         sql_dict = json.loads(sql)
+         physical_name = sql_dict["physical_name"]
+         sql_query = sql_dict["sql"]
+         sql_query = sqlglot_transform_sql(
+             sql_query, cast_concat_as_varchar, dialect="presto"
+         )
+         sql_query = sql_query.replace("FLOAT", "double").replace("float", "double")
+
+         # create our table on athena and extract the metadata information
+         query_metadata = self._create_table(sql_query, physical_name=physical_name)
+         # append our metadata locations
+         query_metadata = self._extract_ctas_metadata(query_metadata)
+         self.ctas_query_info.update({physical_name: query_metadata})
+
+         return query_metadata
+
+     @property
+     def accepted_df_dtypes(self):
+         accepted_df_dtypes = [pd.DataFrame]
+         try:
+             # If pyarrow is installed, add to the accepted list
+             import pyarrow as pa
+
+             accepted_df_dtypes.append(pa.lib.Table)
+         except ImportError:
+             pass
+         return accepted_df_dtypes
+
+     def load_from_file(self, file_path: str) -> str:
+         raise NotImplementedError(
+             "Loading from file is not supported for Athena. "
+             "Please use the `table` method to load data."
+         )
+
+     def process_input_tables(
+         self, input_tables: Sequence[AcceptableInputTableType]
+     ) -> Sequence[AcceptableInputTableType]:
+         input_tables = super().process_input_tables(input_tables)
+         return [
+             self.load_from_file(t) if isinstance(t, str) else t for t in input_tables
+         ]
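For context, a hypothetical usage sketch (not part of the diff) showing how the new `AthenaAPI` slots into the v4 `Linker` pattern from the README example above. The constructor arguments match the `__init__` signature added in this file; the region, database, and bucket names are placeholders:

```py
# Hypothetical wiring of AthenaAPI into the v4 Linker; placeholder
# names throughout.
import boto3

from splink.backends.athena import AthenaAPI

boto3_session = boto3.session.Session(region_name="eu-west-1")  # placeholder region

db_api = AthenaAPI(
    boto3_session,
    output_database="my_athena_db",    # placeholder Glue database
    output_bucket="my-splink-bucket",  # placeholder S3 bucket
)

# Then, as in the README example:
# linker = Linker(df, settings, db_api)
```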