mismo 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mismo-0.3.0/PKG-INFO +111 -0
- mismo-0.3.0/README.md +73 -0
- mismo-0.3.0/mismo/__init__.py +55 -0
- mismo-0.3.0/mismo/_common.py +28 -0
- mismo-0.3.0/mismo/_counts_table.py +137 -0
- mismo-0.3.0/mismo/_data/_datasets/febrl/dataset1.csv +1001 -0
- mismo-0.3.0/mismo/_data/_datasets/febrl/dataset2.csv +5001 -0
- mismo-0.3.0/mismo/_data/_datasets/febrl/dataset3.csv +5001 -0
- mismo-0.3.0/mismo/_data/_datasets/leipzig/affiliations.csv +2261 -0
- mismo-0.3.0/mismo/_data/_datasets/leipzig/make_affiliations.py +102 -0
- mismo-0.3.0/mismo/_data/_datasets/patstat/patents.csv +2380 -0
- mismo-0.3.0/mismo/_data/_datasets/rldata/RLdata10000.csv +10001 -0
- mismo-0.3.0/mismo/_data/_datasets/rldata/RLdata500.csv +501 -0
- mismo-0.3.0/mismo/_datasets.py +172 -0
- mismo-0.3.0/mismo/_explain.py +37 -0
- mismo-0.3.0/mismo/_factorizer.py +152 -0
- mismo-0.3.0/mismo/_funcs.py +78 -0
- mismo-0.3.0/mismo/_n_naive.py +41 -0
- mismo-0.3.0/mismo/_recipe.py +18 -0
- mismo-0.3.0/mismo/_registry.py +75 -0
- mismo-0.3.0/mismo/_resolve.py +275 -0
- mismo-0.3.0/mismo/_structs.py +46 -0
- mismo-0.3.0/mismo/_typing.py +24 -0
- mismo-0.3.0/mismo/_upset.py +241 -0
- mismo-0.3.0/mismo/_util.py +542 -0
- mismo-0.3.0/mismo/arrays/__init__.py +19 -0
- mismo-0.3.0/mismo/arrays/_array.py +202 -0
- mismo-0.3.0/mismo/arrays/_builtins.py +71 -0
- mismo-0.3.0/mismo/arrays/tests/test_array.py +203 -0
- mismo-0.3.0/mismo/cluster/__init__.py +25 -0
- mismo-0.3.0/mismo/cluster/_connected_components.py +314 -0
- mismo-0.3.0/mismo/cluster/_dashboard.py +55 -0
- mismo-0.3.0/mismo/cluster/_dashboard_internal.py +238 -0
- mismo-0.3.0/mismo/cluster/_eval.py +180 -0
- mismo-0.3.0/mismo/cluster/_metrics.py +69 -0
- mismo-0.3.0/mismo/cluster/_subgraph.py +25 -0
- mismo-0.3.0/mismo/cluster/_subgraph_internal.py +42 -0
- mismo-0.3.0/mismo/cluster/test/test_connected_components.py +159 -0
- mismo-0.3.0/mismo/cluster/test/test_eval.py +62 -0
- mismo-0.3.0/mismo/compare/__init__.py +11 -0
- mismo-0.3.0/mismo/compare/_comparer.py +22 -0
- mismo-0.3.0/mismo/compare/_match_level.py +388 -0
- mismo-0.3.0/mismo/compare/_plot.py +334 -0
- mismo-0.3.0/mismo/compare/tests/test_match_level.py +211 -0
- mismo-0.3.0/mismo/conftest.py +198 -0
- mismo-0.3.0/mismo/eda/__init__.py +6 -0
- mismo-0.3.0/mismo/eda/_plot.py +167 -0
- mismo-0.3.0/mismo/exceptions.py +43 -0
- mismo-0.3.0/mismo/fs/__init__.py +11 -0
- mismo-0.3.0/mismo/fs/_plot.py +167 -0
- mismo-0.3.0/mismo/fs/_train.py +280 -0
- mismo-0.3.0/mismo/fs/_train_em.py +93 -0
- mismo-0.3.0/mismo/fs/_util.py +60 -0
- mismo-0.3.0/mismo/fs/_weights.py +391 -0
- mismo-0.3.0/mismo/fs/tests/test_train.py +156 -0
- mismo-0.3.0/mismo/fs/tests/test_weights.py +167 -0
- mismo-0.3.0/mismo/joins/__init__.py +20 -0
- mismo-0.3.0/mismo/joins/_analyze.py +149 -0
- mismo-0.3.0/mismo/joins/_conditions.py +326 -0
- mismo-0.3.0/mismo/joins/_core.py +147 -0
- mismo-0.3.0/mismo/joins/tests/test_conditions.py +47 -0
- mismo-0.3.0/mismo/joins/tests/test_join.py +53 -0
- mismo-0.3.0/mismo/lib/__init__.py +8 -0
- mismo-0.3.0/mismo/lib/email/__init__.py +9 -0
- mismo-0.3.0/mismo/lib/email/_core.py +191 -0
- mismo-0.3.0/mismo/lib/email/tests/test_core.py +82 -0
- mismo-0.3.0/mismo/lib/geo/__init__.py +13 -0
- mismo-0.3.0/mismo/lib/geo/_address.py +275 -0
- mismo-0.3.0/mismo/lib/geo/_census.py +348 -0
- mismo-0.3.0/mismo/lib/geo/_latlon.py +301 -0
- mismo-0.3.0/mismo/lib/geo/_postal.py +172 -0
- mismo-0.3.0/mismo/lib/geo/_regex_parse.py +686 -0
- mismo-0.3.0/mismo/lib/geo/_spacy.py +272 -0
- mismo-0.3.0/mismo/lib/geo/tests/.gitignore +1 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_address.py +200 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_census.py +248 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_latlon.py +91 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_postal.py +177 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_postal_benchmark.py +160 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_re_parse.py +432 -0
- mismo-0.3.0/mismo/lib/geo/tests/test_spacy.py +57 -0
- mismo-0.3.0/mismo/lib/name/__init__.py +10 -0
- mismo-0.3.0/mismo/lib/name/_blocker.py +104 -0
- mismo-0.3.0/mismo/lib/name/_clean.py +63 -0
- mismo-0.3.0/mismo/lib/name/_compare.py +152 -0
- mismo-0.3.0/mismo/lib/name/_dimension.py +73 -0
- mismo-0.3.0/mismo/lib/name/_nicknames.py +125 -0
- mismo-0.3.0/mismo/lib/name/tests/conftest.py +76 -0
- mismo-0.3.0/mismo/lib/name/tests/test_name_blocker.py +33 -0
- mismo-0.3.0/mismo/lib/name/tests/test_name_dimension.py +36 -0
- mismo-0.3.0/mismo/lib/name/tests/test_nicknames.py +52 -0
- mismo-0.3.0/mismo/lib/phone/__init__.py +8 -0
- mismo-0.3.0/mismo/lib/phone/_core.py +180 -0
- mismo-0.3.0/mismo/lib/phone/tests/__init__.py +0 -0
- mismo-0.3.0/mismo/lib/phone/tests/test_core.py +79 -0
- mismo-0.3.0/mismo/linkage/__init__.py +11 -0
- mismo-0.3.0/mismo/linkage/_analyze.py +43 -0
- mismo-0.3.0/mismo/linkage/_combine.py +219 -0
- mismo-0.3.0/mismo/linkage/_dimension.py +37 -0
- mismo-0.3.0/mismo/linkage/_linkage.py +290 -0
- mismo-0.3.0/mismo/linkage/_sample.py +117 -0
- mismo-0.3.0/mismo/linkage/tests/test_key_blocker_benchmark.py +67 -0
- mismo-0.3.0/mismo/linkage/tests/test_sample.py +74 -0
- mismo-0.3.0/mismo/linkage/tests/test_slow_join.py +54 -0
- mismo-0.3.0/mismo/linker/__init__.py +12 -0
- mismo-0.3.0/mismo/linker/_basic.py +95 -0
- mismo-0.3.0/mismo/linker/_common.py +28 -0
- mismo-0.3.0/mismo/linker/_id_linker.py +126 -0
- mismo-0.3.0/mismo/linker/_join_linker.py +64 -0
- mismo-0.3.0/mismo/linker/_key_linker.py +471 -0
- mismo-0.3.0/mismo/linker/_lsh.py +193 -0
- mismo-0.3.0/mismo/linker/_or_linker.py +118 -0
- mismo-0.3.0/mismo/linker/tests/test_id_linker.py +123 -0
- mismo-0.3.0/mismo/linker/tests/test_join_linker.py +263 -0
- mismo-0.3.0/mismo/linker/tests/test_key_linker.py +136 -0
- mismo-0.3.0/mismo/linker/tests/test_key_linker_counts.py +129 -0
- mismo-0.3.0/mismo/linker/tests/test_linker.py +45 -0
- mismo-0.3.0/mismo/linker/tests/test_lsh.py +40 -0
- mismo-0.3.0/mismo/linker/tests/test_or_linker.py +242 -0
- mismo-0.3.0/mismo/playdata.py +317 -0
- mismo-0.3.0/mismo/sets/__init__.py +10 -0
- mismo-0.3.0/mismo/sets/_compare.py +23 -0
- mismo-0.3.0/mismo/sets/_tfidf.py +315 -0
- mismo-0.3.0/mismo/sets/tests/test_compare.py +27 -0
- mismo-0.3.0/mismo/sets/tests/test_tfidf.py +143 -0
- mismo-0.3.0/mismo/tests/__init__.py +0 -0
- mismo-0.3.0/mismo/tests/test_exceptions.py +45 -0
- mismo-0.3.0/mismo/tests/test_factorizer.py +106 -0
- mismo-0.3.0/mismo/tests/test_funcs.py +109 -0
- mismo-0.3.0/mismo/tests/test_n_naive.py +42 -0
- mismo-0.3.0/mismo/tests/test_playdata.py +87 -0
- mismo-0.3.0/mismo/tests/test_resolve.py +108 -0
- mismo-0.3.0/mismo/tests/test_util.py +138 -0
- mismo-0.3.0/mismo/tests/test_version.py +7 -0
- mismo-0.3.0/mismo/tests/util.py +100 -0
- mismo-0.3.0/mismo/text/__init__.py +17 -0
- mismo-0.3.0/mismo/text/_features.py +78 -0
- mismo-0.3.0/mismo/text/_re_extract.py +72 -0
- mismo-0.3.0/mismo/text/_similarity.py +181 -0
- mismo-0.3.0/mismo/text/_strings.py +58 -0
- mismo-0.3.0/mismo/text/tests/test_features.py +80 -0
- mismo-0.3.0/mismo/text/tests/test_re_extract.py +26 -0
- mismo-0.3.0/mismo/text/tests/test_similarity.py +80 -0
- mismo-0.3.0/mismo/text/tests/test_strings.py +24 -0
- mismo-0.3.0/mismo/tf/__init__.py +11 -0
- mismo-0.3.0/mismo/tf/_filterer.py +104 -0
- mismo-0.3.0/mismo/tf/_tf.py +154 -0
- mismo-0.3.0/mismo/types/__init__.py +9 -0
- mismo-0.3.0/mismo/types/_diff.py +454 -0
- mismo-0.3.0/mismo/types/_linked_table.py +467 -0
- mismo-0.3.0/mismo/types/_links_table.py +223 -0
- mismo-0.3.0/mismo/types/_union_table.py +51 -0
- mismo-0.3.0/mismo/types/_updates.py +378 -0
- mismo-0.3.0/mismo/types/_wrapper.py +50 -0
- mismo-0.3.0/mismo/types/tests/test_diff.py +137 -0
- mismo-0.3.0/mismo/types/tests/test_linkage.py +121 -0
- mismo-0.3.0/mismo/types/tests/test_union.py +186 -0
- mismo-0.3.0/mismo/types/tests/test_updates.py +74 -0
- mismo-0.3.0/mismo/types/tests/test_wrapper.py +69 -0
- mismo-0.3.0/mismo/vector/__init__.py +9 -0
- mismo-0.3.0/mismo/vector/_vector.py +270 -0
- mismo-0.3.0/mismo/vector/tests/test_vector.py +269 -0
- mismo-0.3.0/pyproject.toml +196 -0
mismo-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mismo
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: The SQL/Ibis powered sklearn of record linkage.
|
|
5
|
+
Keywords: record linkage,entity resolution,fuzzy linking,machine learning,ibis,sql,splink,duckdb
|
|
6
|
+
Author: Nick Crews
|
|
7
|
+
Author-email: Nick Crews <nicholas.b.crews@gmail.com>
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Programming Language :: Python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
16
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
17
|
+
Requires-Dist: ibis-framework>=9.1.0
|
|
18
|
+
Requires-Dist: sqlglot>=25.29.0
|
|
19
|
+
Requires-Dist: typing-extensions>=4.0.0 ; python_full_version < '3.11'
|
|
20
|
+
Requires-Dist: scikit-learn>=1.5.2 ; extra == 'metrics'
|
|
21
|
+
Requires-Dist: postal>=1.1.7,<1.1.11 ; (sys_platform == 'darwin' and extra == 'postal') or (sys_platform == 'linux' and extra == 'postal')
|
|
22
|
+
Requires-Dist: en-us-address-ner-sm ; extra == 'spacy'
|
|
23
|
+
Requires-Dist: spacy>=3.8.2 ; extra == 'spacy'
|
|
24
|
+
Requires-Dist: altair>=5.0.0 ; extra == 'viz'
|
|
25
|
+
Requires-Dist: ipywidgets>=7.5.1 ; extra == 'viz'
|
|
26
|
+
Requires-Dist: solara-ui>=1.51.0 ; extra == 'viz'
|
|
27
|
+
Requires-Dist: anywidget>=0.9.18 ; extra == 'viz'
|
|
28
|
+
Requires-Python: >=3.10
|
|
29
|
+
Project-URL: Documentation, https://nickcrews.github.io/mismo
|
|
30
|
+
Project-URL: Homepage, https://github.com/NickCrews/mismo
|
|
31
|
+
Project-URL: Issues, https://github.com/NickCrews/mismo/issues
|
|
32
|
+
Project-URL: Source, https://github.com/NickCrews/mismo
|
|
33
|
+
Provides-Extra: metrics
|
|
34
|
+
Provides-Extra: postal
|
|
35
|
+
Provides-Extra: spacy
|
|
36
|
+
Provides-Extra: viz
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# Mismo
|
|
40
|
+
|
|
41
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/mismo.svg)](https://pypi.org/project/mismo)
|
|
42
|
+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mismo.svg)](https://pypi.org/project/mismo)
|
|
43
|
+
|
|
44
|
+
The SQL/Ibis powered sklearn of record linkage.
|
|
45
|
+
|
|
46
|
+
Still in alpha stage. Breaking changes will happen frequently
|
|
47
|
+
and with no warning. Once things are more stabilized I
|
|
48
|
+
will come up with a stability policy. Any suggestions as
|
|
49
|
+
to how you want the API to look would be greatly appreciated.
|
|
50
|
+
I do use this in my work, so at least I do a decent job of
|
|
51
|
+
ensuring correctness.
|
|
52
|
+
|
|
53
|
+
-----
|
|
54
|
+
|
|
55
|
+
## Goals
|
|
56
|
+
|
|
57
|
+
Mismo tries to be the sklearn of record linkage, backed by the scalability
|
|
58
|
+
and power of SQL and [Ibis](https://ibis-project.org/). It is made of many small
|
|
59
|
+
data structures and functions, each with a well-defined and standard API
|
|
60
|
+
that allows them to be composed together and extended easily.
|
|
61
|
+
None of the other record linkage packages I have seen, such as
|
|
62
|
+
[Splink](https://github.com/moj-analytical-services/splink),
|
|
63
|
+
[Dedupe](https://www.github.com/dedupeio/dedupe), or
|
|
64
|
+
[Record Linkage Toolkit](https://github.com/J535D165/recordlinkage),
|
|
65
|
+
had all of these properties, so I decided to make my own.
|
|
66
|
+
|
|
67
|
+
See [Goals and Alternatives](https://nickcrews.github.io/mismo/concepts/goals_and_alternatives)
|
|
68
|
+
for a more detailed discussion of the goals of Mismo and how it compares to other
|
|
69
|
+
record linkage packages.
|
|
70
|
+
|
|
71
|
+
## Features
|
|
72
|
+
- Supports larger-than-memory datasets, executed on powerful SQL engines.
|
|
73
|
+
Use DuckDB for prototyping and for jobs up to maybe ~10M records,
|
|
74
|
+
or Spark or other distributed backends for larger tasks, without
|
|
75
|
+
needing to change your code!
|
|
76
|
+
- Use the clean, strongly-typed, pythonic, Dataframe APIs of [Ibis](https://ibis-project.org/).
|
|
77
|
+
- Small, modular functions and data structures that are easy to plug together
|
|
78
|
+
and extend.
|
|
79
|
+
- Layered API: Use top-level APIs if your task is common enough that it is
|
|
80
|
+
supported out of the box.
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
[`mismo` is available on PyPI](https://pypi.org/project/mismo/).
|
|
85
|
+
I try to publish semver'ed releases after most changes.
|
|
86
|
+
|
|
87
|
+
If I forget to do this, then there are also [prereleases on PyPI](https://pypi.org/project/mismo/#history).
|
|
88
|
+
These are published every week by a github action using the HEAD commit of this repo.
|
|
89
|
+
|
|
90
|
+
You can also install directly from a branch or a specific commit from github:
|
|
91
|
+
|
|
92
|
+
```console
|
|
93
|
+
uv pip install "mismo[viz] @ git+https://github.com/NickCrews/mismo@<SOME-SHA-OR-BRANCH>"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Examples
|
|
97
|
+
|
|
98
|
+
See the [example notebook](https://nickcrews.github.io/mismo/examples/patent_deduplication).
|
|
99
|
+
|
|
100
|
+
## Documentation
|
|
101
|
+
|
|
102
|
+
See the [documentation](https://nickcrews.github.io/mismo).
|
|
103
|
+
|
|
104
|
+
## Contributing
|
|
105
|
+
|
|
106
|
+
See the [contributing guide](https://nickcrews.github.io/mismo/contributing/).
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
`mismo` is distributed under the terms of the
|
|
111
|
+
[LGPL-3.0-or-later](https://spdx.org/licenses/LGPL-3.0-or-later.html) license.
|
mismo-0.3.0/README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Mismo
|
|
2
|
+
|
|
3
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/mismo.svg)](https://pypi.org/project/mismo)
|
|
4
|
+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mismo.svg)](https://pypi.org/project/mismo)
|
|
5
|
+
|
|
6
|
+
The SQL/Ibis powered sklearn of record linkage.
|
|
7
|
+
|
|
8
|
+
Still in alpha stage. Breaking changes will happen frequently
|
|
9
|
+
and with no warning. Once things are more stabilized I
|
|
10
|
+
will come up with a stability policy. Any suggestions as
|
|
11
|
+
to how you want the API to look would be greatly appreciated.
|
|
12
|
+
I do use this in my work, so at least I do a decent job of
|
|
13
|
+
ensuring correctness.
|
|
14
|
+
|
|
15
|
+
-----
|
|
16
|
+
|
|
17
|
+
## Goals
|
|
18
|
+
|
|
19
|
+
Mismo tries to be the sklearn of record linkage, backed by the scalability
|
|
20
|
+
and power of SQL and [Ibis](https://ibis-project.org/). It is made of many small
|
|
21
|
+
data structures and functions, each with a well-defined and standard API
|
|
22
|
+
that allows them to be composed together and extended easily.
|
|
23
|
+
None of the other record linkage packages I have seen, such as
|
|
24
|
+
[Splink](https://github.com/moj-analytical-services/splink),
|
|
25
|
+
[Dedupe](https://www.github.com/dedupeio/dedupe), or
|
|
26
|
+
[Record Linkage Toolkit](https://github.com/J535D165/recordlinkage),
|
|
27
|
+
had all of these properties, so I decided to make my own.
|
|
28
|
+
|
|
29
|
+
See [Goals and Alternatives](https://nickcrews.github.io/mismo/concepts/goals_and_alternatives)
|
|
30
|
+
for a more detailed discussion of the goals of Mismo and how it compares to other
|
|
31
|
+
record linkage packages.
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
- Supports larger-than-memory datasets, executed on powerful SQL engines.
|
|
35
|
+
Use DuckDB for prototyping and for jobs up to maybe ~10M records,
|
|
36
|
+
or Spark or other distributed backends for larger tasks, without
|
|
37
|
+
needing to change your code!
|
|
38
|
+
- Use the clean, strongly-typed, pythonic, Dataframe APIs of [Ibis](https://ibis-project.org/).
|
|
39
|
+
- Small, modular functions and data structures that are easy to plug together
|
|
40
|
+
and extend.
|
|
41
|
+
- Layered API: Use top-level APIs if your task is common enough that it is
|
|
42
|
+
supported out of the box.
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
[`mismo` is available on PyPI](https://pypi.org/project/mismo/).
|
|
47
|
+
I try to publish semver'ed releases after most changes.
|
|
48
|
+
|
|
49
|
+
If I forget to do this, then there are also [prereleases on PyPI](https://pypi.org/project/mismo/#history).
|
|
50
|
+
These are published every week by a github action using the HEAD commit of this repo.
|
|
51
|
+
|
|
52
|
+
You can also install directly from a branch or a specific commit from github:
|
|
53
|
+
|
|
54
|
+
```console
|
|
55
|
+
uv pip install "mismo[viz] @ git+https://github.com/NickCrews/mismo@<SOME-SHA-OR-BRANCH>"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Examples
|
|
59
|
+
|
|
60
|
+
See the [example notebook](https://nickcrews.github.io/mismo/examples/patent_deduplication).
|
|
61
|
+
|
|
62
|
+
## Documentation
|
|
63
|
+
|
|
64
|
+
See the [documentation](https://nickcrews.github.io/mismo).
|
|
65
|
+
|
|
66
|
+
## Contributing
|
|
67
|
+
|
|
68
|
+
See the [contributing guide](https://nickcrews.github.io/mismo/contributing/).
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
`mismo` is distributed under the terms of the
|
|
73
|
+
[LGPL-3.0-or-later](https://spdx.org/licenses/LGPL-3.0-or-later.html) license.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib.metadata
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
from mismo import arrays as arrays
|
|
7
|
+
from mismo import cluster as cluster
|
|
8
|
+
from mismo import compare as compare
|
|
9
|
+
from mismo import eda as eda
|
|
10
|
+
from mismo import exceptions as exceptions
|
|
11
|
+
from mismo import fs as fs
|
|
12
|
+
from mismo import joins as joins
|
|
13
|
+
from mismo import lib as lib
|
|
14
|
+
from mismo import linkage as linkage
|
|
15
|
+
from mismo import linker as linker
|
|
16
|
+
from mismo import playdata as playdata
|
|
17
|
+
from mismo import sets as sets
|
|
18
|
+
from mismo import text as text
|
|
19
|
+
from mismo import tf as tf
|
|
20
|
+
from mismo import types as types
|
|
21
|
+
from mismo import vector as vector
|
|
22
|
+
from mismo._counts_table import CountsTable as CountsTable
|
|
23
|
+
from mismo._datasets import Datasets as Datasets
|
|
24
|
+
from mismo._explain import explain as explain
|
|
25
|
+
from mismo._n_naive import n_naive_comparisons as n_naive_comparisons
|
|
26
|
+
from mismo._recipe import PRecipe as PRecipe
|
|
27
|
+
from mismo.joins import HasJoinCondition as HasJoinCondition
|
|
28
|
+
from mismo.joins import IntoHasJoinCondition as IntoHasJoinCondition
|
|
29
|
+
from mismo.joins import join as join
|
|
30
|
+
from mismo.joins import join_condition as join_condition
|
|
31
|
+
from mismo.joins import left as left
|
|
32
|
+
from mismo.joins import right as right
|
|
33
|
+
from mismo.linkage import Linkage as Linkage
|
|
34
|
+
from mismo.linker import EmptyLinker as EmptyLinker
|
|
35
|
+
from mismo.linker import FullLinker as FullLinker
|
|
36
|
+
from mismo.linker import IDLinker as IDLinker
|
|
37
|
+
from mismo.linker import JoinLinker as JoinLinker
|
|
38
|
+
from mismo.linker import KeyLinker as KeyLinker
|
|
39
|
+
from mismo.linker import Linker as Linker
|
|
40
|
+
from mismo.linker import OrLinker as OrLinker
|
|
41
|
+
from mismo.linker import empty_linkage as empty_linkage
|
|
42
|
+
from mismo.linker import full_linkage as full_linkage
|
|
43
|
+
from mismo.types import Diff as Diff
|
|
44
|
+
from mismo.types import DiffStats as DiffStats
|
|
45
|
+
from mismo.types import LinkCountsTable as LinkCountsTable
|
|
46
|
+
from mismo.types import LinkedTable as LinkedTable
|
|
47
|
+
from mismo.types import LinksTable as LinksTable
|
|
48
|
+
from mismo.types import UnionTable as UnionTable
|
|
49
|
+
from mismo.types import Updates as Updates
|
|
50
|
+
|
|
51
|
+
# Look up the version of the installed distribution that shares this
# package's name. If no such distribution is found, warn and fall back to a
# placeholder string instead of failing the import.
try:
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError as err:
    warnings.warn(
        f"Could not determine version of {__name__}\n{err!s}",
        stacklevel=2,
    )
    __version__ = "unknown"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ibis
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def check_tables_and_links(
    left: ibis.Table, right: ibis.Table, links: ibis.Table
) -> None:
    """Validate that two record tables and a links table fit together.

    Parameters
    ----------
    left
        Table that must contain a ``record_id`` column.
    right
        Table that must contain a ``record_id`` column.
    links
        Table that must contain ``record_id_l`` and ``record_id_r`` columns.

    Raises
    ------
    ValueError
        If a required column is missing, or if building the equality
        expression between a ``record_id`` column and its counterpart in
        ``links`` raises. In the latter case the original error is attached
        as ``__cause__``.
    """
    if "record_id" not in left.columns:
        raise ValueError("column 'record_id' not in table")
    if "record_id" not in right.columns:
        raise ValueError("column 'record_id' not in other")
    if "record_id_l" not in links.columns:
        raise ValueError("column 'record_id_l' not in links")
    if "record_id_r" not in links.columns:
        raise ValueError("column 'record_id_r' not in links")
    # The comparison results are discarded on purpose: all that matters is
    # whether constructing the `==` expression raises.
    try:
        left.record_id == links.record_id_l
    except Exception as err:
        raise ValueError(
            f"left.record_id of type {left.record_id.type()} is not comparable with links.record_id_l of type {links.record_id_l.type()}"  # noqa: E501
        ) from err
    try:
        right.record_id == links.record_id_r
    except Exception as err:
        raise ValueError(
            f"right.record_id of type {right.record_id.type()} is not comparable with links.record_id_r of type {links.record_id_r.type()}"  # noqa: E501
        ) from err
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
from typing import TYPE_CHECKING, NamedTuple
|
|
5
|
+
|
|
6
|
+
import ibis
|
|
7
|
+
from ibis import _
|
|
8
|
+
from ibis.expr import types as ir
|
|
9
|
+
|
|
10
|
+
from mismo.types._wrapper import TableWrapper
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import altair as alt
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _HistSpec(NamedTuple):
    """Text labels used by `_counts_chart` when rendering a counts chart."""

    # Title used for the `n` axis and the `n` tooltip.
    n_title: str
    # Main title of the combined chart.
    chart_title: str
    # Subtitle template; rendered with `.format(n_total=...)`.
    chart_subtitle: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CountsTable(TableWrapper):
    """A table that has, at minimum, an Integer column named `n`.

    Besides `n`, there is a variable number of other columns that act as
    identifiers.

    You won't create this directly, it will be returned to you
    from eg [KeyLinker.key_counts_left][mismo.KeyLinker.key_counts_left],
    [KeyLinker.key_counts_right][mismo.KeyLinker.key_counts_right],
    or [KeyLinker.pair_counts][mismo.KeyLinker.pair_counts].
    """

    n: ir.IntegerColumn
    """The column containing the count."""

    # This MUST be set in subclasses
    _HIST_SPEC: _HistSpec

    # NOTE(review): `functools.cache` on a method keys the cache on `self`
    # and keeps every instance alive for the cache's lifetime (ruff B019).
    # Left unchanged here; confirm this is intended.
    @functools.cache
    def n_total(self) -> int:
        """Return the sum of `n` as a Python int, or 0 if the sum is null."""
        total = self.n.sum().execute()
        if total is None:
            return 0
        return int(total)

    def chart(self) -> alt.Chart:
        """Chart these counts, labelled according to `_HIST_SPEC`."""
        labels = self._HIST_SPEC
        return _counts_chart(self, hist_spec=labels)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class KeyCountsTable(CountsTable):
    """A `CountsTable` whose chart is labelled in terms of records."""

    _HIST_SPEC = _HistSpec(
        n_title="Number of Records",
        chart_title="Number of Records by Key",
        chart_subtitle="{n_total:_} Total Records",
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class PairCountsTable(CountsTable):
    """A `CountsTable` whose chart is labelled in terms of pairs."""

    _HIST_SPEC = _HistSpec(
        n_title="Number of Pairs",
        chart_title="Number of Pairs by Key",
        chart_subtitle="{n_total:_} Total Pairs",
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _counts_chart(counts: CountsTable, *, hist_spec: _HistSpec):
    """Build an interactive Altair chart of the counts in `counts`.

    The result stacks two charts vertically: a short "scrubber" area chart
    on top, on which an x-interval can be dragged, and a bar chart below
    that is filtered by that selection. Only rows with ``n > 0`` are shown.

    Parameters
    ----------
    counts
        The counts to plot. Every column other than ``n`` is treated as
        part of the key.
    hist_spec
        Titles and subtitle template for the chart.

    Returns
    -------
    The combined (scrubber above bars) Altair chart.
    """
    import altair as alt

    n_total = counts.n_total()
    key_cols = [c for c in counts.columns if c != "n"]
    # One display string per row, e.g. "(a, b)", built from all key columns.
    val = "(" + ibis.literal(", ").join(counts[c].cast(str) for c in key_cols) + ")"
    key_and_n = counts.mutate(val.name("key"), "n")
    key_and_n = key_and_n.filter(_.n > 0)
    # Keep `frac` an ibis expression even when there is nothing to count.
    # A bare Python `0` has no `.cast()`, so building `explanation` below
    # raised AttributeError whenever n_total was 0.
    frac = key_and_n.n / n_total if n_total > 0 else ibis.literal(0.0)
    key_and_n = key_and_n.mutate(
        frac=frac,
        explanation=(
            "Out of the "
            + f"{n_total:_}, "
            + key_and_n.n.cast(str)
            + " ("
            + (frac * 100).cast(int).cast(str)
            + "%) had the key of "
            + key_and_n.key.cast(str)
        ),
    )
    n_keys_total = key_and_n.count().execute()
    key_title = "(" + ", ".join(key_cols) + ")"
    # Dragging on the scrubber selects an x-range; the bar chart is filtered
    # by it.
    scrubber_selection = alt.selection_interval(encodings=["x"], empty=True)
    width = 800
    zoomin = (
        alt.Chart(key_and_n, width=width)
        .mark_bar()
        .encode(
            alt.X("key:O", title=key_title, sort="-y"),
            alt.Y(
                "n:Q",
                title=hist_spec.n_title,
                scale=alt.Scale(type="symlog"),
            ),
            tooltip=[
                alt.Tooltip("n:Q", title=hist_spec.n_title, format=","),
                alt.Tooltip("frac:Q", title="Fraction", format=".2%"),
                *[alt.Tooltip(col) for col in key_cols],
                alt.Tooltip("explanation", title="Explanation"),
            ],
        )
        .transform_filter(scrubber_selection)
    )
    scrubber = (
        alt.Chart(
            key_and_n,
            width=width,
            height=50,
            title=alt.Title(
                text="<Drag to select>",
                dy=30,
                anchor="middle",
                fontSize=12,
                color="gray",
            ),
        )
        .mark_area(interpolate="step-after")
        .encode(
            alt.X("key:O", sort="-y", axis=None),
            alt.Y("n:Q", title=None, axis=None),
        )
        .add_params(scrubber_selection)
    )
    together = scrubber & zoomin
    together = together.resolve_scale(color="independent")
    together = together.properties(
        title=alt.Title(
            hist_spec.chart_title,
            subtitle=[
                hist_spec.chart_subtitle.format(n_total=n_total),
                f"{n_keys_total:_} keys total",
            ],
            anchor="middle",
        )
    )
    return together
|