mismo 0.2.7.dev1__tar.gz → 0.2.7.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/PKG-INFO +1 -1
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_datasets.py +3 -1
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_resolve.py +2 -9
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_util.py +33 -7
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/compare/_match_level.py +4 -3
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/_address.py +1 -1
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tf/_tf.py +2 -2
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/_linked_table.py +6 -2
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/_updates.py +2 -2
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/pyproject.toml +1 -1
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/README.md +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_common.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_counts_table.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/febrl/dataset1.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/febrl/dataset2.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/febrl/dataset3.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/leipzig/affiliations.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/leipzig/make_affiliations.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/patstat/patents.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/rldata/RLdata10000.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_data/_datasets/rldata/RLdata500.csv +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_explain.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_factorizer.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_funcs.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_n_naive.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_recipe.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_registry.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_structs.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_typing.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/_upset.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/arrays/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/arrays/_array.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/arrays/_builtins.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/arrays/tests/test_array.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_connected_components.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_dashboard.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_dashboard_internal.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_eval.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_metrics.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_subgraph.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/_subgraph_internal.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/test/test_connected_components.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/cluster/test/test_eval.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/compare/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/compare/_comparer.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/compare/_plot.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/compare/tests/test_match_level.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/conftest.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/eda/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/eda/_plot.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/exceptions.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/_plot.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/_train.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/_train_em.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/_util.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/_weights.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/tests/test_train.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/fs/tests/test_weights.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/joins/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/joins/_analyze.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/joins/_conditions.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/joins/_core.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/joins/tests/test_conditions.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/joins/tests/test_join.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/email/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/email/_core.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/email/tests/test_core.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/_census.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/_latlon.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/_postal.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/_regex_parse.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/_spacy.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/.gitignore +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_address.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_census.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_latlon.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_postal.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_postal_benchmark.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_re_parse.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/geo/tests/test_spacy.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/_blocker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/_clean.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/_compare.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/_dimension.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/_nicknames.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/tests/conftest.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/tests/test_name_blocker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/tests/test_name_dimension.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/name/tests/test_nicknames.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/phone/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/phone/_core.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/phone/tests/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/lib/phone/tests/test_core.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/_analyze.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/_combine.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/_dimension.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/_linkage.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/_sample.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/tests/test_key_blocker_benchmark.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/tests/test_sample.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linkage/tests/test_slow_join.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_basic.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_common.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_id_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_join_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_key_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_lsh.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/_or_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_id_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_join_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_key_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_key_linker_counts.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_lsh.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/linker/tests/test_or_linker.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/playdata.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/sets/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/sets/_compare.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/sets/_tfidf.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/sets/tests/test_compare.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/sets/tests/test_tfidf.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_exceptions.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_factorizer.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_funcs.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_n_naive.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_playdata.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_resolve.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_util.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/test_version.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tests/util.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/_features.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/_re_extract.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/_similarity.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/_strings.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/tests/test_features.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/tests/test_re_extract.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/tests/test_similarity.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/text/tests/test_strings.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tf/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/tf/_filterer.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/_diff.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/_links_table.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/_table_wrapper.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/_union_table.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/tests/test_diff.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/tests/test_linkage.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/tests/test_union.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/types/tests/test_updates.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/vector/__init__.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/vector/_vector.py +0 -0
- {mismo-0.2.7.dev1 → mismo-0.2.7.dev3}/mismo/vector/tests/test_vector.py +0 -0
|
@@ -5,6 +5,8 @@ from typing import Callable, Iterable, Mapping, Sequence
|
|
|
5
5
|
import ibis
|
|
6
6
|
from ibis.expr import types as ir
|
|
7
7
|
|
|
8
|
+
from mismo import _util
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
class Datasets:
|
|
10
12
|
"""An ordered, dict-like collection of tables of records.
|
|
@@ -73,7 +75,7 @@ class Datasets:
|
|
|
73
75
|
def map(self, f: ibis.Deferred | Callable[[str, ir.Table], ir.Table]) -> ir.Table:
|
|
74
76
|
"""Return a new Datasets with all tables transformed by `f`."""
|
|
75
77
|
if isinstance(f, ibis.Deferred):
|
|
76
|
-
return self.__class__({name:
|
|
78
|
+
return self.__class__({name: _util.bind(f, t) for name, t in self.items()})
|
|
77
79
|
else:
|
|
78
80
|
return self.__class__({name: f(name, t) for name, t in self.items()})
|
|
79
81
|
|
|
@@ -32,7 +32,7 @@ class DeferredResolver(ValueResolver):
|
|
|
32
32
|
|
|
33
33
|
def __call__(self, t: ibis.Table) -> ibis.Column:
|
|
34
34
|
raw = self.deferred.resolve(**{self.name: t})
|
|
35
|
-
return
|
|
35
|
+
return _util.bind_one(t, raw)
|
|
36
36
|
|
|
37
37
|
def __repr__(self) -> str:
|
|
38
38
|
if self.name == "_":
|
|
@@ -49,7 +49,7 @@ class LiteralResolver(ValueResolver):
|
|
|
49
49
|
|
|
50
50
|
def __call__(self, t: ibis.Table) -> ibis.Column:
|
|
51
51
|
"""Resolve a literal value."""
|
|
52
|
-
resolved =
|
|
52
|
+
resolved = _util.bind(t, self.value)
|
|
53
53
|
if len(resolved) != 1:
|
|
54
54
|
raise ValueError(
|
|
55
55
|
f"Expected 1 column, got {len(resolved)} from {self.value}"
|
|
@@ -95,13 +95,6 @@ class FuncResolver(ValueResolver):
|
|
|
95
95
|
return f"FuncResolver({self.func!r})"
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
def _resolve(t: ir.Table, spec) -> ir.Value:
|
|
99
|
-
values = t.bind(spec)
|
|
100
|
-
if len(values) != 1:
|
|
101
|
-
raise ValueError(f"Expected 1 column, got {len(values)} from {spec}")
|
|
102
|
-
return values[0]
|
|
103
|
-
|
|
104
|
-
|
|
105
98
|
def value_resolver(spec: ibis.Value | Deferred | str) -> ValueResolver:
|
|
106
99
|
"""
|
|
107
100
|
Given a spec, return a ValueResolver that resolves to a single column.
|
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import base64
|
|
4
4
|
from collections.abc import Sequence
|
|
5
5
|
from contextlib import contextmanager
|
|
6
|
-
from typing import Any, Callable, Iterable, Literal, Mapping, TypeVar
|
|
6
|
+
from typing import Any, Callable, Iterable, Literal, Mapping, TypeVar, overload
|
|
7
7
|
import uuid
|
|
8
8
|
import warnings
|
|
9
9
|
|
|
@@ -89,12 +89,38 @@ def cases(
|
|
|
89
89
|
return builder.else_(else_).end()
|
|
90
90
|
|
|
91
91
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
92
|
+
@overload
|
|
93
|
+
def bind(t: ibis.Table, ref: Any, /) -> tuple[ibis.Column, ...]: ...
|
|
94
|
+
@overload
|
|
95
|
+
def bind(t: ibis.Deferred, ref: Any, /) -> tuple[ibis.Deferred]: ...
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def bind(t: ibis.Deferred | ibis.Table, ref: Any) -> tuple[ibis.Column, ...]:
|
|
99
|
+
"""Reference into a table to get Columns and Scalars.
|
|
100
|
+
|
|
101
|
+
ibis._.bind(ref) does not work because it returns another Deferred.
|
|
102
|
+
|
|
103
|
+
Also, per https://github.com/ibis-project/ibis/pull/11746,
|
|
104
|
+
in some versions of ibis, .bind() returns a generator, not a tuple,
|
|
105
|
+
so this function always returns a tuple.
|
|
106
|
+
"""
|
|
107
|
+
if isinstance(t, ibis.Table):
|
|
108
|
+
return tuple(t.bind(ref))
|
|
109
|
+
return (t[ref],)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@overload
|
|
113
|
+
def bind_one(t: ibis.Table, ref: Any, /) -> ibis.Column: ...
|
|
114
|
+
@overload
|
|
115
|
+
def bind_one(t: ibis.Deferred, ref: Any, /) -> ibis.Deferred: ...
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def bind_one(t: ibis.Deferred | ibis.Table, ref: Any) -> ibis.Column | ibis.Deferred:
|
|
119
|
+
"""Like bind(), but ensure that exactly one column is returned."""
|
|
120
|
+
cols = bind(t, ref)
|
|
121
|
+
if len(cols) != 1:
|
|
122
|
+
raise ValueError(f"Expected 1 column, got {len(cols)} from {ref}")
|
|
123
|
+
return cols[0]
|
|
98
124
|
|
|
99
125
|
|
|
100
126
|
def get_column(
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from abc import ABCMeta
|
|
4
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Iterator
|
|
5
|
+
from typing import Generic, Literal, Type, TypeVar, overload
|
|
5
6
|
|
|
6
7
|
import ibis
|
|
7
8
|
from ibis.expr import types as ir
|
|
@@ -320,7 +321,7 @@ def _is_stringy(v):
|
|
|
320
321
|
MatchLevelT = TypeVar("MatchLevelT", bound=MatchLevel)
|
|
321
322
|
|
|
322
323
|
|
|
323
|
-
class LevelComparer:
|
|
324
|
+
class LevelComparer(Generic[MatchLevelT]):
|
|
324
325
|
"""
|
|
325
326
|
Assigns a MatchLevel to record pairs based on one dimension, e.g. *name*
|
|
326
327
|
"""
|
|
@@ -373,7 +374,7 @@ class LevelComparer:
|
|
|
373
374
|
if representation is None:
|
|
374
375
|
representation = self.representation
|
|
375
376
|
|
|
376
|
-
cases = [(
|
|
377
|
+
cases = [(_util.bind_one(pairs, c), level) for c, level in self.cases]
|
|
377
378
|
if representation == "string":
|
|
378
379
|
cases = [(c, level.as_string()) for c, level in cases]
|
|
379
380
|
elif representation == "integer":
|
|
@@ -72,7 +72,7 @@ def _featurize(
|
|
|
72
72
|
# for every time it appears in the SQL. See https://github.com/duckdb/duckdb/discussions/14649.
|
|
73
73
|
# So, if we did one .mutate(), we would end up with like literally 100 regex
|
|
74
74
|
# evaluations in the SQL, which is 100x slower than evaluating the regex once.
|
|
75
|
-
input_column =
|
|
75
|
+
input_column = _util.bind_one(t, input_column)
|
|
76
76
|
t = t.mutate(_parsed=parse_street1_re(input_column.street1), _cleaned=input_column)
|
|
77
77
|
t = t.mutate(
|
|
78
78
|
__address_featured=ibis.struct(
|
|
@@ -52,7 +52,7 @@ class ColumnStats:
|
|
|
52
52
|
self,
|
|
53
53
|
table: ibis.Table,
|
|
54
54
|
*,
|
|
55
|
-
column: str | ibis.Deferred | ibis.Column = None,
|
|
55
|
+
column: str | ibis.Deferred | ibis.Column | None = None,
|
|
56
56
|
name_as: str | None = None,
|
|
57
57
|
default: Literal["1/N"] | int | float = "1/N",
|
|
58
58
|
) -> ibis.Table:
|
|
@@ -70,7 +70,7 @@ class ColumnStats:
|
|
|
70
70
|
else:
|
|
71
71
|
default = ibis.literal(default, "float64")
|
|
72
72
|
|
|
73
|
-
table_column =
|
|
73
|
+
table_column = _util.bind_one(table, column)
|
|
74
74
|
|
|
75
75
|
unique_name = _util.unique_name("join_key")
|
|
76
76
|
# TODO: this could be factored out into a join_lookup() function
|
|
@@ -384,7 +384,11 @@ class LinkCountsTable(TableWrapper):
|
|
|
384
384
|
else:
|
|
385
385
|
subtitle = "eg 'there were 1000 records with 0 links, 500 with 1 link, 100 with 2 links, ...'" # noqa: E501
|
|
386
386
|
|
|
387
|
-
frac_records
|
|
387
|
+
frac_records: ir.StringValue = (
|
|
388
|
+
(self.n_records / total_records * 100).cast(int).cast(str)
|
|
389
|
+
if total_records > 0
|
|
390
|
+
else ibis.literal("0")
|
|
391
|
+
)
|
|
388
392
|
t = self.mutate(
|
|
389
393
|
frac_records=frac_records,
|
|
390
394
|
explanation=(
|
|
@@ -392,7 +396,7 @@ class LinkCountsTable(TableWrapper):
|
|
|
392
396
|
+ f"{total_records:_} records, "
|
|
393
397
|
+ self.n_records.cast(str)
|
|
394
398
|
+ " ("
|
|
395
|
-
+
|
|
399
|
+
+ frac_records
|
|
396
400
|
+ "%) had "
|
|
397
401
|
+ self.n_links.cast(str)
|
|
398
402
|
+ " links."
|
|
@@ -282,8 +282,8 @@ class Updates(TableWrapper):
|
|
|
282
282
|
|
|
283
283
|
def is_changed(self, column: str, /) -> ibis.ir.BooleanColumn:
|
|
284
284
|
"""Is column.before different from column.after? Never returns NULL."""
|
|
285
|
-
|
|
286
|
-
return is_changed(
|
|
285
|
+
resolved_col = _util.bind_one(self._t, column)
|
|
286
|
+
return is_changed(resolved_col)
|
|
287
287
|
|
|
288
288
|
def filter(self, *args, **kwargs):
|
|
289
289
|
return self.__class__(
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "mismo"
|
|
3
3
|
description = "The SQL/Ibis powered sklearn of record linkage."
|
|
4
|
-
version = "0.2.7.dev1"
|
|
4
|
+
version = "0.2.7.dev3"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.10" # ibis>=9.1 requires python >=3.10
|
|
7
7
|
license = { text = "LGPL-3.0-or-later" }
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|