mismo 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. mismo-0.3.0/PKG-INFO +111 -0
  2. mismo-0.3.0/README.md +73 -0
  3. mismo-0.3.0/mismo/__init__.py +55 -0
  4. mismo-0.3.0/mismo/_common.py +28 -0
  5. mismo-0.3.0/mismo/_counts_table.py +137 -0
  6. mismo-0.3.0/mismo/_data/_datasets/febrl/dataset1.csv +1001 -0
  7. mismo-0.3.0/mismo/_data/_datasets/febrl/dataset2.csv +5001 -0
  8. mismo-0.3.0/mismo/_data/_datasets/febrl/dataset3.csv +5001 -0
  9. mismo-0.3.0/mismo/_data/_datasets/leipzig/affiliations.csv +2261 -0
  10. mismo-0.3.0/mismo/_data/_datasets/leipzig/make_affiliations.py +102 -0
  11. mismo-0.3.0/mismo/_data/_datasets/patstat/patents.csv +2380 -0
  12. mismo-0.3.0/mismo/_data/_datasets/rldata/RLdata10000.csv +10001 -0
  13. mismo-0.3.0/mismo/_data/_datasets/rldata/RLdata500.csv +501 -0
  14. mismo-0.3.0/mismo/_datasets.py +172 -0
  15. mismo-0.3.0/mismo/_explain.py +37 -0
  16. mismo-0.3.0/mismo/_factorizer.py +152 -0
  17. mismo-0.3.0/mismo/_funcs.py +78 -0
  18. mismo-0.3.0/mismo/_n_naive.py +41 -0
  19. mismo-0.3.0/mismo/_recipe.py +18 -0
  20. mismo-0.3.0/mismo/_registry.py +75 -0
  21. mismo-0.3.0/mismo/_resolve.py +275 -0
  22. mismo-0.3.0/mismo/_structs.py +46 -0
  23. mismo-0.3.0/mismo/_typing.py +24 -0
  24. mismo-0.3.0/mismo/_upset.py +241 -0
  25. mismo-0.3.0/mismo/_util.py +542 -0
  26. mismo-0.3.0/mismo/arrays/__init__.py +19 -0
  27. mismo-0.3.0/mismo/arrays/_array.py +202 -0
  28. mismo-0.3.0/mismo/arrays/_builtins.py +71 -0
  29. mismo-0.3.0/mismo/arrays/tests/test_array.py +203 -0
  30. mismo-0.3.0/mismo/cluster/__init__.py +25 -0
  31. mismo-0.3.0/mismo/cluster/_connected_components.py +314 -0
  32. mismo-0.3.0/mismo/cluster/_dashboard.py +55 -0
  33. mismo-0.3.0/mismo/cluster/_dashboard_internal.py +238 -0
  34. mismo-0.3.0/mismo/cluster/_eval.py +180 -0
  35. mismo-0.3.0/mismo/cluster/_metrics.py +69 -0
  36. mismo-0.3.0/mismo/cluster/_subgraph.py +25 -0
  37. mismo-0.3.0/mismo/cluster/_subgraph_internal.py +42 -0
  38. mismo-0.3.0/mismo/cluster/test/test_connected_components.py +159 -0
  39. mismo-0.3.0/mismo/cluster/test/test_eval.py +62 -0
  40. mismo-0.3.0/mismo/compare/__init__.py +11 -0
  41. mismo-0.3.0/mismo/compare/_comparer.py +22 -0
  42. mismo-0.3.0/mismo/compare/_match_level.py +388 -0
  43. mismo-0.3.0/mismo/compare/_plot.py +334 -0
  44. mismo-0.3.0/mismo/compare/tests/test_match_level.py +211 -0
  45. mismo-0.3.0/mismo/conftest.py +198 -0
  46. mismo-0.3.0/mismo/eda/__init__.py +6 -0
  47. mismo-0.3.0/mismo/eda/_plot.py +167 -0
  48. mismo-0.3.0/mismo/exceptions.py +43 -0
  49. mismo-0.3.0/mismo/fs/__init__.py +11 -0
  50. mismo-0.3.0/mismo/fs/_plot.py +167 -0
  51. mismo-0.3.0/mismo/fs/_train.py +280 -0
  52. mismo-0.3.0/mismo/fs/_train_em.py +93 -0
  53. mismo-0.3.0/mismo/fs/_util.py +60 -0
  54. mismo-0.3.0/mismo/fs/_weights.py +391 -0
  55. mismo-0.3.0/mismo/fs/tests/test_train.py +156 -0
  56. mismo-0.3.0/mismo/fs/tests/test_weights.py +167 -0
  57. mismo-0.3.0/mismo/joins/__init__.py +20 -0
  58. mismo-0.3.0/mismo/joins/_analyze.py +149 -0
  59. mismo-0.3.0/mismo/joins/_conditions.py +326 -0
  60. mismo-0.3.0/mismo/joins/_core.py +147 -0
  61. mismo-0.3.0/mismo/joins/tests/test_conditions.py +47 -0
  62. mismo-0.3.0/mismo/joins/tests/test_join.py +53 -0
  63. mismo-0.3.0/mismo/lib/__init__.py +8 -0
  64. mismo-0.3.0/mismo/lib/email/__init__.py +9 -0
  65. mismo-0.3.0/mismo/lib/email/_core.py +191 -0
  66. mismo-0.3.0/mismo/lib/email/tests/test_core.py +82 -0
  67. mismo-0.3.0/mismo/lib/geo/__init__.py +13 -0
  68. mismo-0.3.0/mismo/lib/geo/_address.py +275 -0
  69. mismo-0.3.0/mismo/lib/geo/_census.py +348 -0
  70. mismo-0.3.0/mismo/lib/geo/_latlon.py +301 -0
  71. mismo-0.3.0/mismo/lib/geo/_postal.py +172 -0
  72. mismo-0.3.0/mismo/lib/geo/_regex_parse.py +686 -0
  73. mismo-0.3.0/mismo/lib/geo/_spacy.py +272 -0
  74. mismo-0.3.0/mismo/lib/geo/tests/.gitignore +1 -0
  75. mismo-0.3.0/mismo/lib/geo/tests/test_address.py +200 -0
  76. mismo-0.3.0/mismo/lib/geo/tests/test_census.py +248 -0
  77. mismo-0.3.0/mismo/lib/geo/tests/test_latlon.py +91 -0
  78. mismo-0.3.0/mismo/lib/geo/tests/test_postal.py +177 -0
  79. mismo-0.3.0/mismo/lib/geo/tests/test_postal_benchmark.py +160 -0
  80. mismo-0.3.0/mismo/lib/geo/tests/test_re_parse.py +432 -0
  81. mismo-0.3.0/mismo/lib/geo/tests/test_spacy.py +57 -0
  82. mismo-0.3.0/mismo/lib/name/__init__.py +10 -0
  83. mismo-0.3.0/mismo/lib/name/_blocker.py +104 -0
  84. mismo-0.3.0/mismo/lib/name/_clean.py +63 -0
  85. mismo-0.3.0/mismo/lib/name/_compare.py +152 -0
  86. mismo-0.3.0/mismo/lib/name/_dimension.py +73 -0
  87. mismo-0.3.0/mismo/lib/name/_nicknames.py +125 -0
  88. mismo-0.3.0/mismo/lib/name/tests/conftest.py +76 -0
  89. mismo-0.3.0/mismo/lib/name/tests/test_name_blocker.py +33 -0
  90. mismo-0.3.0/mismo/lib/name/tests/test_name_dimension.py +36 -0
  91. mismo-0.3.0/mismo/lib/name/tests/test_nicknames.py +52 -0
  92. mismo-0.3.0/mismo/lib/phone/__init__.py +8 -0
  93. mismo-0.3.0/mismo/lib/phone/_core.py +180 -0
  94. mismo-0.3.0/mismo/lib/phone/tests/__init__.py +0 -0
  95. mismo-0.3.0/mismo/lib/phone/tests/test_core.py +79 -0
  96. mismo-0.3.0/mismo/linkage/__init__.py +11 -0
  97. mismo-0.3.0/mismo/linkage/_analyze.py +43 -0
  98. mismo-0.3.0/mismo/linkage/_combine.py +219 -0
  99. mismo-0.3.0/mismo/linkage/_dimension.py +37 -0
  100. mismo-0.3.0/mismo/linkage/_linkage.py +290 -0
  101. mismo-0.3.0/mismo/linkage/_sample.py +117 -0
  102. mismo-0.3.0/mismo/linkage/tests/test_key_blocker_benchmark.py +67 -0
  103. mismo-0.3.0/mismo/linkage/tests/test_sample.py +74 -0
  104. mismo-0.3.0/mismo/linkage/tests/test_slow_join.py +54 -0
  105. mismo-0.3.0/mismo/linker/__init__.py +12 -0
  106. mismo-0.3.0/mismo/linker/_basic.py +95 -0
  107. mismo-0.3.0/mismo/linker/_common.py +28 -0
  108. mismo-0.3.0/mismo/linker/_id_linker.py +126 -0
  109. mismo-0.3.0/mismo/linker/_join_linker.py +64 -0
  110. mismo-0.3.0/mismo/linker/_key_linker.py +471 -0
  111. mismo-0.3.0/mismo/linker/_lsh.py +193 -0
  112. mismo-0.3.0/mismo/linker/_or_linker.py +118 -0
  113. mismo-0.3.0/mismo/linker/tests/test_id_linker.py +123 -0
  114. mismo-0.3.0/mismo/linker/tests/test_join_linker.py +263 -0
  115. mismo-0.3.0/mismo/linker/tests/test_key_linker.py +136 -0
  116. mismo-0.3.0/mismo/linker/tests/test_key_linker_counts.py +129 -0
  117. mismo-0.3.0/mismo/linker/tests/test_linker.py +45 -0
  118. mismo-0.3.0/mismo/linker/tests/test_lsh.py +40 -0
  119. mismo-0.3.0/mismo/linker/tests/test_or_linker.py +242 -0
  120. mismo-0.3.0/mismo/playdata.py +317 -0
  121. mismo-0.3.0/mismo/sets/__init__.py +10 -0
  122. mismo-0.3.0/mismo/sets/_compare.py +23 -0
  123. mismo-0.3.0/mismo/sets/_tfidf.py +315 -0
  124. mismo-0.3.0/mismo/sets/tests/test_compare.py +27 -0
  125. mismo-0.3.0/mismo/sets/tests/test_tfidf.py +143 -0
  126. mismo-0.3.0/mismo/tests/__init__.py +0 -0
  127. mismo-0.3.0/mismo/tests/test_exceptions.py +45 -0
  128. mismo-0.3.0/mismo/tests/test_factorizer.py +106 -0
  129. mismo-0.3.0/mismo/tests/test_funcs.py +109 -0
  130. mismo-0.3.0/mismo/tests/test_n_naive.py +42 -0
  131. mismo-0.3.0/mismo/tests/test_playdata.py +87 -0
  132. mismo-0.3.0/mismo/tests/test_resolve.py +108 -0
  133. mismo-0.3.0/mismo/tests/test_util.py +138 -0
  134. mismo-0.3.0/mismo/tests/test_version.py +7 -0
  135. mismo-0.3.0/mismo/tests/util.py +100 -0
  136. mismo-0.3.0/mismo/text/__init__.py +17 -0
  137. mismo-0.3.0/mismo/text/_features.py +78 -0
  138. mismo-0.3.0/mismo/text/_re_extract.py +72 -0
  139. mismo-0.3.0/mismo/text/_similarity.py +181 -0
  140. mismo-0.3.0/mismo/text/_strings.py +58 -0
  141. mismo-0.3.0/mismo/text/tests/test_features.py +80 -0
  142. mismo-0.3.0/mismo/text/tests/test_re_extract.py +26 -0
  143. mismo-0.3.0/mismo/text/tests/test_similarity.py +80 -0
  144. mismo-0.3.0/mismo/text/tests/test_strings.py +24 -0
  145. mismo-0.3.0/mismo/tf/__init__.py +11 -0
  146. mismo-0.3.0/mismo/tf/_filterer.py +104 -0
  147. mismo-0.3.0/mismo/tf/_tf.py +154 -0
  148. mismo-0.3.0/mismo/types/__init__.py +9 -0
  149. mismo-0.3.0/mismo/types/_diff.py +454 -0
  150. mismo-0.3.0/mismo/types/_linked_table.py +467 -0
  151. mismo-0.3.0/mismo/types/_links_table.py +223 -0
  152. mismo-0.3.0/mismo/types/_union_table.py +51 -0
  153. mismo-0.3.0/mismo/types/_updates.py +378 -0
  154. mismo-0.3.0/mismo/types/_wrapper.py +50 -0
  155. mismo-0.3.0/mismo/types/tests/test_diff.py +137 -0
  156. mismo-0.3.0/mismo/types/tests/test_linkage.py +121 -0
  157. mismo-0.3.0/mismo/types/tests/test_union.py +186 -0
  158. mismo-0.3.0/mismo/types/tests/test_updates.py +74 -0
  159. mismo-0.3.0/mismo/types/tests/test_wrapper.py +69 -0
  160. mismo-0.3.0/mismo/vector/__init__.py +9 -0
  161. mismo-0.3.0/mismo/vector/_vector.py +270 -0
  162. mismo-0.3.0/mismo/vector/tests/test_vector.py +269 -0
  163. mismo-0.3.0/pyproject.toml +196 -0
mismo-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,111 @@
1
+ Metadata-Version: 2.3
2
+ Name: mismo
3
+ Version: 0.3.0
4
+ Summary: The SQL/Ibis powered sklearn of record linkage.
5
+ Keywords: record linkage,entity resolution,fuzzy linking,machine learning,ibis,sql,splink,duckdb
6
+ Author: Nick Crews
7
+ Author-email: Nick Crews <nicholas.b.crews@gmail.com>
8
+ License: LGPL-3.0-or-later
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Programming Language :: Python
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: Implementation :: CPython
16
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
17
+ Requires-Dist: ibis-framework>=9.1.0
18
+ Requires-Dist: sqlglot>=25.29.0
19
+ Requires-Dist: typing-extensions>=4.0.0 ; python_full_version < '3.11'
20
+ Requires-Dist: scikit-learn>=1.5.2 ; extra == 'metrics'
21
+ Requires-Dist: postal>=1.1.7,<1.1.11 ; (sys_platform == 'darwin' and extra == 'postal') or (sys_platform == 'linux' and extra == 'postal')
22
+ Requires-Dist: en-us-address-ner-sm ; extra == 'spacy'
23
+ Requires-Dist: spacy>=3.8.2 ; extra == 'spacy'
24
+ Requires-Dist: altair>=5.0.0 ; extra == 'viz'
25
+ Requires-Dist: ipywidgets>=7.5.1 ; extra == 'viz'
26
+ Requires-Dist: solara-ui>=1.51.0 ; extra == 'viz'
27
+ Requires-Dist: anywidget>=0.9.18 ; extra == 'viz'
28
+ Requires-Python: >=3.10
29
+ Project-URL: Documentation, https://nickcrews.github.io/mismo
30
+ Project-URL: Homepage, https://github.com/NickCrews/mismo
31
+ Project-URL: Issues, https://github.com/NickCrews/mismo/issues
32
+ Project-URL: Source, https://github.com/NickCrews/mismo
33
+ Provides-Extra: metrics
34
+ Provides-Extra: postal
35
+ Provides-Extra: spacy
36
+ Provides-Extra: viz
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Mismo
40
+
41
+ [![PyPI - Version](https://img.shields.io/pypi/v/mismo.svg)](https://pypi.org/project/mismo)
42
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mismo.svg)](https://pypi.org/project/mismo)
43
+
44
+ The SQL/Ibis powered sklearn of record linkage.
45
+
46
+ Still in alpha stage. Breaking changes will happen frequently
47
+ and with no warning. Once things are more stabilized I
48
+ will come up with a stability policy. Any suggestions as
49
+ to how you want the API to look would be greatly appreciated.
50
+ I do use this in my work, so at least I do a decent job of
51
+ ensuring correctness.
52
+
53
+ -----
54
+
55
+ ## Goals
56
+
57
+ Mismo tries to be the sklearn of record linkage, backed by the scalability
58
+ and power of SQL and [Ibis](https://ibis-project.org/). It is made of many small
59
+ data structures and functions, each with a well-defined and standard API
60
+ that allows them to be composed together and extended easily.
61
+ None of the other record linkage packages I have seen, such as
62
+ [Splink](https://github.com/moj-analytical-services/splink),
63
+ [Dedupe](https://www.github.com/dedupeio/dedupe), or
64
+ [Record Linkage Toolkit](https://github.com/J535D165/recordlinkage),
65
+ had all of these properties, so I decided to make my own.
66
+
67
+ See [Goals and Alternatives](https://nickcrews.github.io/mismo/concepts/goals_and_alternatives)
68
+ for a more detailed discussion of the goals of Mismo and how it compares to other
69
+ record linkage packages.
70
+
71
+ ## Features
72
+ - Supports larger-than-memory datasets, executed on powerful SQL engines.
73
+ Use DuckDB for prototyping and for jobs up to maybe ~10M records,
74
+ or Spark or other distributed backends for larger tasks, without
75
+ needing to change your code!
76
+ - Use the clean, strongly-typed, pythonic, Dataframe APIs of [Ibis](https://ibis-project.org/).
77
+ - Small, modular functions and data structures that are easy to plug together
78
+ and extend.
79
+ - Layered API: Use top-level APIs if your task is common enough that it is
80
+ supported out of the box.
81
+
82
+ ## Installation
83
+
84
+ [`mismo` is available on PyPI](https://pypi.org/project/mismo/).
85
+ I try to publish semver'ed releases after most changes.
86
+
87
+ If I forget to do this, then there are also [prereleases on PyPI](https://pypi.org/project/mismo/#history).
88
+ These are published every week by a github action using the HEAD commit of this repo.
89
+
90
+ You can also install directly from a branch or a specific commit from github:
91
+
92
+ ```console
93
+ uv pip install "mismo[viz] @ git+https://github.com/NickCrews/mismo@<SOME-SHA-OR-BRANCH>"
94
+ ```
95
+
96
+ ## Examples
97
+
98
+ See the [example notebook](https://nickcrews.github.io/mismo/examples/patent_deduplication).
99
+
100
+ ## Documentation
101
+
102
+ See the [documentation](https://nickcrews.github.io/mismo).
103
+
104
+ ## Contributing
105
+
106
+ See the [contributing guide](https://nickcrews.github.io/mismo/contributing/).
107
+
108
+ ## License
109
+
110
+ `mismo` is distributed under the terms of the
111
+ [LGPL-3.0-or-later](https://spdx.org/licenses/LGPL-3.0-or-later.html) license.
mismo-0.3.0/README.md ADDED
@@ -0,0 +1,73 @@
1
+ # Mismo
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/mismo.svg)](https://pypi.org/project/mismo)
4
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mismo.svg)](https://pypi.org/project/mismo)
5
+
6
+ The SQL/Ibis powered sklearn of record linkage.
7
+
8
+ Still in alpha stage. Breaking changes will happen frequently
9
+ and with no warning. Once things are more stabilized I
10
+ will come up with a stability policy. Any suggestions as
11
+ to how you want the API to look would be greatly appreciated.
12
+ I do use this in my work, so at least I do a decent job of
13
+ ensuring correctness.
14
+
15
+ -----
16
+
17
+ ## Goals
18
+
19
+ Mismo tries to be the sklearn of record linkage, backed by the scalability
20
+ and power of SQL and [Ibis](https://ibis-project.org/). It is made of many small
21
+ data structures and functions, each with a well-defined and standard API
22
+ that allows them to be composed together and extended easily.
23
+ None of the other record linkage packages I have seen, such as
24
+ [Splink](https://github.com/moj-analytical-services/splink),
25
+ [Dedupe](https://www.github.com/dedupeio/dedupe), or
26
+ [Record Linkage Toolkit](https://github.com/J535D165/recordlinkage),
27
+ had all of these properties, so I decided to make my own.
28
+
29
+ See [Goals and Alternatives](https://nickcrews.github.io/mismo/concepts/goals_and_alternatives)
30
+ for a more detailed discussion of the goals of Mismo and how it compares to other
31
+ record linkage packages.
32
+
33
+ ## Features
34
+ - Supports larger-than-memory datasets, executed on powerful SQL engines.
35
+ Use DuckDB for prototyping and for jobs up to maybe ~10M records,
36
+ or Spark or other distributed backends for larger tasks, without
37
+ needing to change your code!
38
+ - Use the clean, strongly-typed, pythonic, Dataframe APIs of [Ibis](https://ibis-project.org/).
39
+ - Small, modular functions and data structures that are easy to plug together
40
+ and extend.
41
+ - Layered API: Use top-level APIs if your task is common enough that it is
42
+ supported out of the box.
43
+
44
+ ## Installation
45
+
46
+ [`mismo` is available on PyPI](https://pypi.org/project/mismo/).
47
+ I try to publish semver'ed releases after most changes.
48
+
49
+ If I forget to do this, then there are also [prereleases on PyPI](https://pypi.org/project/mismo/#history).
50
+ These are published every week by a github action using the HEAD commit of this repo.
51
+
52
+ You can also install directly from a branch or a specific commit from github:
53
+
54
+ ```console
55
+ uv pip install "mismo[viz] @ git+https://github.com/NickCrews/mismo@<SOME-SHA-OR-BRANCH>"
56
+ ```
57
+
58
+ ## Examples
59
+
60
+ See the [example notebook](https://nickcrews.github.io/mismo/examples/patent_deduplication).
61
+
62
+ ## Documentation
63
+
64
+ See the [documentation](https://nickcrews.github.io/mismo).
65
+
66
+ ## Contributing
67
+
68
+ See the [contributing guide](https://nickcrews.github.io/mismo/contributing/).
69
+
70
+ ## License
71
+
72
+ `mismo` is distributed under the terms of the
73
+ [LGPL-3.0-or-later](https://spdx.org/licenses/LGPL-3.0-or-later.html) license.
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib.metadata
4
+ import warnings
5
+
6
+ from mismo import arrays as arrays
7
+ from mismo import cluster as cluster
8
+ from mismo import compare as compare
9
+ from mismo import eda as eda
10
+ from mismo import exceptions as exceptions
11
+ from mismo import fs as fs
12
+ from mismo import joins as joins
13
+ from mismo import lib as lib
14
+ from mismo import linkage as linkage
15
+ from mismo import linker as linker
16
+ from mismo import playdata as playdata
17
+ from mismo import sets as sets
18
+ from mismo import text as text
19
+ from mismo import tf as tf
20
+ from mismo import types as types
21
+ from mismo import vector as vector
22
+ from mismo._counts_table import CountsTable as CountsTable
23
+ from mismo._datasets import Datasets as Datasets
24
+ from mismo._explain import explain as explain
25
+ from mismo._n_naive import n_naive_comparisons as n_naive_comparisons
26
+ from mismo._recipe import PRecipe as PRecipe
27
+ from mismo.joins import HasJoinCondition as HasJoinCondition
28
+ from mismo.joins import IntoHasJoinCondition as IntoHasJoinCondition
29
+ from mismo.joins import join as join
30
+ from mismo.joins import join_condition as join_condition
31
+ from mismo.joins import left as left
32
+ from mismo.joins import right as right
33
+ from mismo.linkage import Linkage as Linkage
34
+ from mismo.linker import EmptyLinker as EmptyLinker
35
+ from mismo.linker import FullLinker as FullLinker
36
+ from mismo.linker import IDLinker as IDLinker
37
+ from mismo.linker import JoinLinker as JoinLinker
38
+ from mismo.linker import KeyLinker as KeyLinker
39
+ from mismo.linker import Linker as Linker
40
+ from mismo.linker import OrLinker as OrLinker
41
+ from mismo.linker import empty_linkage as empty_linkage
42
+ from mismo.linker import full_linkage as full_linkage
43
+ from mismo.types import Diff as Diff
44
+ from mismo.types import DiffStats as DiffStats
45
+ from mismo.types import LinkCountsTable as LinkCountsTable
46
+ from mismo.types import LinkedTable as LinkedTable
47
+ from mismo.types import LinksTable as LinksTable
48
+ from mismo.types import UnionTable as UnionTable
49
+ from mismo.types import Updates as Updates
50
+
51
try:
    # Look up the installed distribution's version under this module's name.
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError as err:
    # No installed distribution metadata was found: warn instead of failing
    # the import, and fall back to a placeholder version string.
    warnings.warn(f"Could not determine version of {__name__}\n{err!s}", stacklevel=2)
    __version__ = "unknown"
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+
3
+ import ibis
4
+
5
+
6
def check_tables_and_links(
    left: ibis.Table, right: ibis.Table, links: ibis.Table
) -> None:
    """Check that `left`, `right` and `links` have comparable record ID columns.

    Parameters
    ----------
    left
        Table that must have a `record_id` column.
    right
        Table that must have a `record_id` column.
    links
        Table that must have `record_id_l` and `record_id_r` columns.

    Raises
    ------
    ValueError
        If one of the required columns is missing, or if building the
        equality expression between a table's `record_id` and the
        corresponding links column raises. In the latter case the original
        error is attached as ``__cause__``.
    """
    if "record_id" not in left.columns:
        raise ValueError("column 'record_id' not in table")
    if "record_id" not in right.columns:
        raise ValueError("column 'record_id' not in other")
    if "record_id_l" not in links.columns:
        raise ValueError("column 'record_id_l' not in links")
    if "record_id_r" not in links.columns:
        raise ValueError("column 'record_id_r' not in links")
    try:
        # The comparison result is discarded on purpose: it is only built to
        # find out whether the two columns can be compared at all.
        left.record_id == links.record_id_l
    except Exception as e:
        raise ValueError(
            f"left.record_id of type {left.record_id.type()} is not comparable with links.record_id_l of type {links.record_id_l.type()}"  # noqa: E501
        ) from e
    try:
        right.record_id == links.record_id_r
    except Exception as e:
        raise ValueError(
            f"right.record_id of type {right.record_id.type()} is not comparable with links.record_id_r of type {links.record_id_r.type()}"  # noqa: E501
        ) from e
@@ -0,0 +1,137 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ from typing import TYPE_CHECKING, NamedTuple
5
+
6
+ import ibis
7
+ from ibis import _
8
+ from ibis.expr import types as ir
9
+
10
+ from mismo.types._wrapper import TableWrapper
11
+
12
+ if TYPE_CHECKING:
13
+ import altair as alt
14
+
15
+
16
class _HistSpec(NamedTuple):
    """The text labels used when charting a counts table."""

    # Title of the count axis and of the count tooltip.
    n_title: str
    # Main title of the combined chart.
    chart_title: str
    # Subtitle template; it is formatted with an `n_total` keyword.
    chart_subtitle: str
20
+
21
+
22
class CountsTable(TableWrapper):
    """A table that has, at minimum, an Integer column called `n`.

    Any other columns act as identifiers, and how many of them there are
    varies from table to table.

    You are not expected to construct one of these yourself. Instead, one is
    handed back to you
    from eg [KeyLinker.key_counts_left][mismo.KeyLinker.key_counts_left],
    [KeyLinker.key_counts_right][mismo.KeyLinker.key_counts_right],
    or [KeyLinker.pair_counts][mismo.KeyLinker.pair_counts].
    """

    n: ir.IntegerColumn
    """The column containing the count."""

    # Subclasses are REQUIRED to define this.
    _HIST_SPEC: _HistSpec

    # NOTE(review): functools.cache on a method keys the cache on `self` and
    # keeps each instance alive for the lifetime of the cache.
    @functools.cache
    def n_total(self) -> int:
        """Return the sum of the `n` column as an int, or 0 if that sum is null."""
        total = self.n.sum().execute()
        if total is None:
            return 0
        return int(total)

    def chart(self) -> alt.Chart:
        """Return a chart of the counts, labeled according to `_HIST_SPEC`."""
        return _counts_chart(self, hist_spec=self._HIST_SPEC)
47
+
48
+
49
class KeyCountsTable(CountsTable):
    """A CountsTable whose chart labels are worded in terms of records."""

    _HIST_SPEC = _HistSpec(
        n_title="Number of Records",
        chart_title="Number of Records by Key",
        chart_subtitle="{n_total:_} Total Records",
    )
53
+
54
+
55
class PairCountsTable(CountsTable):
    """A CountsTable whose chart labels are worded in terms of pairs."""

    _HIST_SPEC = _HistSpec(
        n_title="Number of Pairs",
        chart_title="Number of Pairs by Key",
        chart_subtitle="{n_total:_} Total Pairs",
    )
59
+
60
+
61
def _counts_chart(counts: CountsTable, *, hist_spec: _HistSpec) -> alt.Chart:
    """Build a two-part chart of the counts in `counts`, one bar per key.

    The upper part is a short "scrubber" area chart; dragging an interval
    across it filters the lower part, a bar chart of `n` per key on a symlog
    scale. Rows whose `n` is not greater than 0 are left out.

    Parameters
    ----------
    counts
        The table to chart. Every column other than `n` is treated as part
        of the key.
    hist_spec
        The axis title, chart title and subtitle template to use.

    Returns
    -------
    alt.Chart
        The scrubber stacked on top of the bar chart.
    """
    # Imported here so that altair is only needed when a chart is requested.
    import altair as alt

    n_total = counts.n_total()
    key_cols = [c for c in counts.columns if c != "n"]
    # A single display string per row, eg "(a, b)", built from all key columns.
    val = "(" + ibis.literal(", ").join([counts[c].cast(str) for c in key_cols]) + ")"
    key_and_n = counts.mutate(val.name("key"), "n")
    key_and_n = key_and_n.filter(_.n > 0)
    if n_total > 0:
        frac = key_and_n.n / n_total
    else:
        # Keep `frac` an ibis expression: a plain Python 0 has no `.cast()`,
        # so building the explanation below would raise AttributeError.
        frac = ibis.literal(0.0)
    key_and_n = key_and_n.mutate(
        frac=frac,
        explanation=(
            "Out of the "
            + f"{n_total:_}, "
            + key_and_n.n.cast(str)
            + " ("
            + (frac * 100).cast(int).cast(str)
            + "%) had the key of "
            + key_and_n.key.cast(str)
        ),
    )
    n_keys_total = key_and_n.count().execute()
    key_title = "(" + ", ".join(key_cols) + ")"
    scrubber_selection = alt.selection_interval(encodings=["x"], empty=True)
    width = 800
    # The detailed bar chart, restricted to whatever the scrubber has selected.
    zoomin = (
        alt.Chart(key_and_n, width=width)
        .mark_bar()
        .encode(
            alt.X("key:O", title=key_title, sort="-y"),
            alt.Y(
                "n:Q",
                title=hist_spec.n_title,
                scale=alt.Scale(type="symlog"),
            ),
            tooltip=[
                alt.Tooltip("n:Q", title=hist_spec.n_title, format=","),
                alt.Tooltip("frac:Q", title="Fraction", format=".2%"),
                *[alt.Tooltip(col) for col in key_cols],
                alt.Tooltip("explanation", title="Explanation"),
            ],
        )
        .transform_filter(scrubber_selection)
    )
    # The short overview chart that owns the interval selection.
    scrubber = (
        alt.Chart(
            key_and_n,
            width=width,
            height=50,
            title=alt.Title(
                text="<Drag to select>",
                dy=30,
                anchor="middle",
                fontSize=12,
                color="gray",
            ),
        )
        .mark_area(interpolate="step-after")
        .encode(
            alt.X("key:O", sort="-y", axis=None),
            alt.Y("n:Q", title=None, axis=None),
        )
        .add_params(scrubber_selection)
    )
    together = scrubber & zoomin
    together = together.resolve_scale(color="independent")
    together = together.properties(
        title=alt.Title(
            hist_spec.chart_title,
            subtitle=[
                hist_spec.chart_subtitle.format(n_total=n_total),
                f"{n_keys_total:_} keys total",
            ],
            anchor="middle",
        )
    )
    return together
+ return together