matchbox-db 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matchbox/__init__.py +13 -0
- matchbox/client/__init__.py +9 -0
- matchbox/client/_handler.py +302 -0
- matchbox/client/_settings.py +26 -0
- matchbox/client/clean/__init__.py +29 -0
- matchbox/client/clean/lib.py +191 -0
- matchbox/client/clean/steps/__init__.py +71 -0
- matchbox/client/clean/steps/clean_basic.py +508 -0
- matchbox/client/clean/steps/clean_basic_original.py +128 -0
- matchbox/client/clean/utils.py +158 -0
- matchbox/client/helpers/__init__.py +15 -0
- matchbox/client/helpers/cleaner.py +60 -0
- matchbox/client/helpers/comparison.py +47 -0
- matchbox/client/helpers/index.py +68 -0
- matchbox/client/helpers/selector.py +253 -0
- matchbox/client/models/__init__.py +1 -0
- matchbox/client/models/dedupers/__init__.py +5 -0
- matchbox/client/models/dedupers/base.py +54 -0
- matchbox/client/models/dedupers/naive.py +83 -0
- matchbox/client/models/linkers/__init__.py +9 -0
- matchbox/client/models/linkers/base.py +55 -0
- matchbox/client/models/linkers/deterministic.py +93 -0
- matchbox/client/models/linkers/splinklinker.py +253 -0
- matchbox/client/models/linkers/weighteddeterministic.py +166 -0
- matchbox/client/models/models.py +168 -0
- matchbox/client/results.py +217 -0
- matchbox/client/visualisation.py +41 -0
- matchbox/common/__init__.py +1 -0
- matchbox/common/arrow.py +24 -0
- matchbox/common/db.py +121 -0
- matchbox/common/dtos.py +225 -0
- matchbox/common/exceptions.py +176 -0
- matchbox/common/factories/__init__.py +1 -0
- matchbox/common/factories/dags.py +137 -0
- matchbox/common/factories/entities.py +629 -0
- matchbox/common/factories/models.py +945 -0
- matchbox/common/factories/sources.py +660 -0
- matchbox/common/graph.py +61 -0
- matchbox/common/hash.py +156 -0
- matchbox/common/logging.py +72 -0
- matchbox/common/sources.py +333 -0
- matchbox/common/transform.py +406 -0
- matchbox/server/__init__.py +15 -0
- matchbox/server/api/__init__.py +5 -0
- matchbox/server/api/arrow.py +75 -0
- matchbox/server/api/cache.py +196 -0
- matchbox/server/api/routes.py +674 -0
- matchbox/server/base.py +459 -0
- matchbox/server/postgresql/__init__.py +8 -0
- matchbox/server/postgresql/adapter.py +456 -0
- matchbox/server/postgresql/benchmark/__init__.py +1 -0
- matchbox/server/postgresql/benchmark/cluster_pipeline.py +83 -0
- matchbox/server/postgresql/benchmark/generate_tables.py +560 -0
- matchbox/server/postgresql/benchmark/query.py +93 -0
- matchbox/server/postgresql/db.py +102 -0
- matchbox/server/postgresql/mixin.py +19 -0
- matchbox/server/postgresql/orm.py +295 -0
- matchbox/server/postgresql/utils/__init__.py +1 -0
- matchbox/server/postgresql/utils/db.py +308 -0
- matchbox/server/postgresql/utils/insert.py +558 -0
- matchbox/server/postgresql/utils/query.py +584 -0
- matchbox/server/postgresql/utils/results.py +198 -0
- matchbox_db-0.2.2.dist-info/LICENSE +21 -0
- matchbox_db-0.2.2.dist-info/METADATA +160 -0
- matchbox_db-0.2.2.dist-info/RECORD +67 -0
- matchbox_db-0.2.2.dist-info/WHEEL +5 -0
- matchbox_db-0.2.2.dist-info/top_level.txt +1 -0
matchbox/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Matchbox."""
|
|
2
|
+
|
|
3
|
+
from matchbox.common.exceptions import MatchboxClientSettingsException
|
|
4
|
+
from matchbox.common.logging import logger
|
|
5
|
+
|
|
6
|
+
# Re-export the client API at package level. When the client settings cannot
# be loaded the package must still import (e.g. for server-only use), so the
# failure is downgraded to a warning instead of propagating.
try:
    # Environment variables must be loaded first for other imports to work
    from matchbox.client import *  # noqa: E402, F403
except MatchboxClientSettingsException:
    logger.warning(
        "Impossible to initialise client. Please ignore if running in server mode. "
        "Otherwise, check your .env file.",
    )
|
|
matchbox/client/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""All client-side functionalities of Matchbox."""
|
|
2
|
+
|
|
3
|
+
from matchbox.client.helpers.cleaner import process
|
|
4
|
+
from matchbox.client.helpers.index import index
|
|
5
|
+
from matchbox.client.helpers.selector import match, query
|
|
6
|
+
from matchbox.client.models.models import make_model
|
|
7
|
+
from matchbox.client.visualisation import draw_resolution_graph
|
|
8
|
+
|
|
9
|
+
__all__ = ("process", "index", "match", "query", "make_model", "draw_resolution_graph")
|
|
matchbox/client/_handler.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"""Functions abstracting the interaction with the server API."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
from pyarrow import Table
|
|
9
|
+
from pyarrow.parquet import read_table
|
|
10
|
+
|
|
11
|
+
from matchbox.client._settings import ClientSettings, settings
|
|
12
|
+
from matchbox.common.arrow import SCHEMA_MB_IDS, table_to_buffer
|
|
13
|
+
from matchbox.common.dtos import (
|
|
14
|
+
BackendRetrievableType,
|
|
15
|
+
ModelAncestor,
|
|
16
|
+
ModelMetadata,
|
|
17
|
+
ModelOperationStatus,
|
|
18
|
+
NotFoundError,
|
|
19
|
+
UploadStatus,
|
|
20
|
+
)
|
|
21
|
+
from matchbox.common.exceptions import (
|
|
22
|
+
MatchboxClientFileError,
|
|
23
|
+
MatchboxDeletionNotConfirmed,
|
|
24
|
+
MatchboxResolutionNotFoundError,
|
|
25
|
+
MatchboxServerFileError,
|
|
26
|
+
MatchboxSourceNotFoundError,
|
|
27
|
+
MatchboxUnhandledServerResponse,
|
|
28
|
+
MatchboxUnparsedClientRequest,
|
|
29
|
+
)
|
|
30
|
+
from matchbox.common.graph import ResolutionGraph
|
|
31
|
+
from matchbox.common.hash import hash_to_base64
|
|
32
|
+
from matchbox.common.sources import Match, Source, SourceAddress
|
|
33
|
+
|
|
34
|
+
URLEncodeHandledType = str | int | float | bytes
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def encode_param_value(
|
|
38
|
+
v: URLEncodeHandledType | Iterable[URLEncodeHandledType],
|
|
39
|
+
) -> str | list[str]:
|
|
40
|
+
if isinstance(v, str):
|
|
41
|
+
return v
|
|
42
|
+
elif isinstance(v, (int, float)):
|
|
43
|
+
return str(v)
|
|
44
|
+
elif isinstance(v, bytes):
|
|
45
|
+
return hash_to_base64(v)
|
|
46
|
+
# Needs to be at the end, so we don't apply it to e.g. strings
|
|
47
|
+
if isinstance(v, Iterable):
|
|
48
|
+
return [encode_param_value(item) for item in v]
|
|
49
|
+
raise ValueError(f"It was not possible to parse {v} as an URL parameter")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def url_params(
|
|
53
|
+
params: dict[str, URLEncodeHandledType | Iterable[URLEncodeHandledType]],
|
|
54
|
+
) -> dict[str, str | list[str]]:
|
|
55
|
+
"""Prepares a dictionary of parameters to be encoded in a URL."""
|
|
56
|
+
non_null = {k: v for k, v in params.items() if v}
|
|
57
|
+
return {k: encode_param_value(v) for k, v in non_null.items()}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def handle_http_code(res: httpx.Response) -> httpx.Response:
    """Handle HTTP status codes and raise appropriate exceptions.

    Args:
        res: The response to inspect. Its body is read here so that
            ``content`` and ``json()`` can be used below.

    Returns:
        The same response, when its status code is in the 200-299 range.

    Raises:
        MatchboxServerFileError: On a 400 whose body parses as an
            ``UploadStatus``.
        RuntimeError: On a 400 whose body does not parse as an
            ``UploadStatus``, or on a 404 about an entity that is neither a
            source nor a resolution.
        MatchboxSourceNotFoundError: On a 404 about a source.
        MatchboxResolutionNotFoundError: On a 404 about a resolution.
        MatchboxDeletionNotConfirmed: On a 409.
        MatchboxUnparsedClientRequest: On a 422.
        MatchboxUnhandledServerResponse: On any other status code.
    """
    res.read()

    if 200 <= res.status_code <= 299:
        return res

    if res.status_code == 400:
        # Validation raises rather than returning a falsy value, so the
        # "unexpected body" case has to be detected by catching the failure.
        # Both JSON decoding errors and pydantic validation errors derive
        # from ValueError.
        try:
            error = UploadStatus.model_validate(res.json())
        except ValueError as exc:
            raise RuntimeError(f"Unexpected 400 error: {res.content}") from exc
        raise MatchboxServerFileError(error.details)

    if res.status_code == 404:
        error = NotFoundError.model_validate(res.json())
        if error.entity == BackendRetrievableType.SOURCE:
            raise MatchboxSourceNotFoundError(error.details)
        if error.entity == BackendRetrievableType.RESOLUTION:
            raise MatchboxResolutionNotFoundError(error.details)
        raise RuntimeError(f"Unexpected 404 error: {error.details}")

    if res.status_code == 409:
        error = ModelOperationStatus.model_validate(res.json())
        raise MatchboxDeletionNotConfirmed(message=error.details)

    if res.status_code == 422:
        raise MatchboxUnparsedClientRequest(res.content)

    raise MatchboxUnhandledServerResponse(res.content)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def create_client(settings: ClientSettings) -> httpx.Client:
    """Create an HTTPX client with proper configuration.

    Args:
        settings: Client settings supplying ``api_root`` and ``timeout``.

    Returns:
        A client rooted at ``settings.api_root`` whose responses are all
        passed through ``handle_http_code``.
    """
    # The response hook turns non-2xx responses into Matchbox exceptions.
    hooks = {"response": [handle_http_code]}
    return httpx.Client(
        base_url=settings.api_root,
        timeout=settings.timeout,
        event_hooks=hooks,
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
CLIENT = create_client(settings=settings)
|
|
103
|
+
|
|
104
|
+
# Retrieval
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def query(
    source_address: SourceAddress,
    resolution_name: str | None = None,
    threshold: int | None = None,
    limit: int | None = None,
) -> Table:
    """Query a source through the ``/query`` endpoint.

    Args:
        source_address: Address of the source; its ``full_name`` and
            ``warehouse_hash`` are sent as query parameters.
        resolution_name: Optional resolution name forwarded to the server.
        threshold: Optional threshold forwarded to the server.
        limit: Optional limit forwarded to the server.

    Returns:
        The table read from the response body.

    Raises:
        MatchboxClientFileError: If the returned table's schema does not
            equal ``SCHEMA_MB_IDS``.
    """
    request_params = url_params(
        {
            "full_name": source_address.full_name,
            # Converted to b64 by `url_params()`
            "warehouse_hash_b64": source_address.warehouse_hash,
            "resolution_name": resolution_name,
            "threshold": threshold,
            "limit": limit,
        }
    )
    res = CLIENT.get("/query", params=request_params)

    table = read_table(BytesIO(res.content))

    if not table.schema.equals(SCHEMA_MB_IDS):
        raise MatchboxClientFileError(
            message=(
                f"Schema mismatch. Expected:\n{SCHEMA_MB_IDS}\nGot:\n{table.schema}"
            )
        )

    return table
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def match(
    targets: list[SourceAddress],
    source: SourceAddress,
    source_pk: str,
    resolution_name: str,
    threshold: int | None = None,
) -> list[Match]:
    """Request matches for one source key through the ``/match`` endpoint.

    Args:
        targets: Addresses of the target sources; their ``full_name`` and
            ``warehouse_hash`` values are sent as parallel lists.
        source: Address of the source the key belongs to.
        source_pk: Key sent to the server as ``source_pk``.
        resolution_name: Resolution name forwarded to the server.
        threshold: Optional threshold forwarded to the server.

    Returns:
        One validated ``Match`` per item in the JSON response.
    """
    request_params = url_params(
        {
            "target_full_names": [t.full_name for t in targets],
            # Converted to b64 by `url_params()`
            "target_warehouse_hashes_b64": [t.warehouse_hash for t in targets],
            "source_full_name": source.full_name,
            # Converted to b64 by `url_params()`
            "source_warehouse_hash_b64": source.warehouse_hash,
            "source_pk": source_pk,
            "resolution_name": resolution_name,
            "threshold": threshold,
        }
    )
    res = CLIENT.get("/match", params=request_params)

    return [Match.model_validate(m) for m in res.json()]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# Data management
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def index(source: Source, data_hashes: Table) -> UploadStatus:
    """Index a Source in Matchbox.

    Posts the source metadata to ``/sources``, uploads the buffer built from
    ``data_hashes`` to ``/upload/{id}``, then polls ``/upload/{id}/status``
    every ``settings.retry_delay`` seconds until the upload is complete or
    failed.

    Args:
        source: The source whose metadata is registered.
        data_hashes: Table to upload for that source.

    Returns:
        The final upload status, whose ``status`` is ``"complete"``.

    Raises:
        MatchboxServerFileError: If the server reports the upload as failed,
            whether in the upload response itself or in a later poll.
    """
    buffer = table_to_buffer(table=data_hashes)

    # Upload metadata
    metadata_res = CLIENT.post("/sources", json=source.model_dump())
    upload = UploadStatus.model_validate(metadata_res.json())

    # Upload data
    upload_res = CLIENT.post(
        f"/upload/{upload.id}",
        files={"file": (f"{upload.id}.parquet", buffer, "application/octet-stream")},
    )

    # Poll until complete with retry/timeout configuration. Sleeping before
    # each poll avoids a pointless wait after the terminal status is seen.
    status = UploadStatus.model_validate(upload_res.json())
    while status.status not in ("complete", "failed"):
        time.sleep(settings.retry_delay)
        status_res = CLIENT.get(f"/upload/{upload.id}/status")
        status = UploadStatus.model_validate(status_res.json())

    # Checked after the loop so a failure reported by the upload response
    # itself is raised too, not returned as if it were a success.
    if status.status == "failed":
        raise MatchboxServerFileError(status.details)

    return status
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def get_source(address: SourceAddress) -> Source:
    """Fetch a source from Matchbox by its address.

    Args:
        address: Address whose base64-encoded ``warehouse_hash`` and
            ``full_name`` form the request path.

    Returns:
        The source validated from the JSON response.
    """
    encoded_hash = hash_to_base64(address.warehouse_hash)
    endpoint = f"/sources/{encoded_hash}/{address.full_name}"
    return Source.model_validate(CLIENT.get(endpoint).json())
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def get_resolution_graph() -> ResolutionGraph:
    """Get the resolution graph from Matchbox.

    Returns:
        The graph validated from the ``/report/resolutions`` JSON response.
    """
    payload = CLIENT.get("/report/resolutions").json()
    return ResolutionGraph.model_validate(payload)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Model management
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def insert_model(model: ModelMetadata) -> ModelOperationStatus:
    """Insert a model in Matchbox.

    Args:
        model: Metadata of the model; its dump is posted to ``/models``.

    Returns:
        The operation status validated from the JSON response.
    """
    payload = model.model_dump()
    response = CLIENT.post("/models", json=payload)
    return ModelOperationStatus.model_validate(response.json())
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def get_model(name: str) -> ModelMetadata:
    """Get a model's metadata from Matchbox.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        The metadata validated from the JSON response.
    """
    payload = CLIENT.get(f"/models/{name}").json()
    return ModelMetadata.model_validate(payload)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def add_model_results(name: str, results: Table) -> UploadStatus:
    """Upload model results in Matchbox.

    Initialises an upload via ``/models/{name}/results``, uploads the buffer
    built from ``results`` to ``/upload/{id}``, then polls
    ``/upload/{id}/status`` every ``settings.retry_delay`` seconds until the
    upload is complete or failed.

    Args:
        name: Name of the model the results belong to.
        results: Table of results to upload.

    Returns:
        The final upload status, whose ``status`` is ``"complete"``.

    Raises:
        MatchboxServerFileError: If the server reports the upload as failed,
            whether in the upload response itself or in a later poll.
    """
    buffer = table_to_buffer(table=results)

    # Initialise upload
    metadata_res = CLIENT.post(f"/models/{name}/results")
    upload = UploadStatus.model_validate(metadata_res.json())

    # Upload data
    upload_res = CLIENT.post(
        f"/upload/{upload.id}",
        files={"file": (f"{upload.id}.parquet", buffer, "application/octet-stream")},
    )

    # Poll until complete with retry/timeout configuration. Sleeping before
    # each poll avoids a pointless wait after the terminal status is seen.
    status = UploadStatus.model_validate(upload_res.json())
    while status.status not in ("complete", "failed"):
        time.sleep(settings.retry_delay)
        status_res = CLIENT.get(f"/upload/{upload.id}/status")
        status = UploadStatus.model_validate(status_res.json())

    # Checked after the loop so a failure reported by the upload response
    # itself is raised too, not returned as if it were a success.
    if status.status == "failed":
        raise MatchboxServerFileError(status.details)

    return status
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def get_model_results(name: str) -> Table:
    """Get model results from Matchbox.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        The table read from the response body.
    """
    response = CLIENT.get(f"/models/{name}/results")
    return read_table(BytesIO(response.content))
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def set_model_truth(name: str, truth: float) -> ModelOperationStatus:
    """Set the truth threshold for a model in Matchbox.

    Args:
        name: Name of the model, used in the request path.
        truth: Threshold value, sent as the JSON body of the PATCH request.

    Returns:
        The operation status validated from the JSON response.
    """
    endpoint = f"/models/{name}/truth"
    response = CLIENT.patch(endpoint, json=truth)
    return ModelOperationStatus.model_validate(response.json())
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def get_model_truth(name: str) -> float:
    """Get the truth threshold for a model in Matchbox.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        The decoded JSON body of the response, returned without validation.
    """
    response = CLIENT.get(f"/models/{name}/truth")
    return response.json()
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def get_model_ancestors(name: str) -> list[ModelAncestor]:
    """Get the ancestors of a model in Matchbox.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        One validated ``ModelAncestor`` per item in the JSON response.
    """
    items = CLIENT.get(f"/models/{name}/ancestors").json()
    return [ModelAncestor.model_validate(item) for item in items]
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def set_model_ancestors_cache(
    name: str, ancestors: list[ModelAncestor]
) -> ModelOperationStatus:
    """Set the ancestors cache for a model in Matchbox.

    Args:
        name: Name of the model, used in the request path.
        ancestors: Ancestors whose dumps are posted as a JSON list.

    Returns:
        The operation status validated from the JSON response.
    """
    payload = [ancestor.model_dump() for ancestor in ancestors]
    response = CLIENT.post(f"/models/{name}/ancestors_cache", json=payload)
    return ModelOperationStatus.model_validate(response.json())
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def get_model_ancestors_cache(name: str) -> list[ModelAncestor]:
    """Get the ancestors cache for a model in Matchbox.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        One validated ``ModelAncestor`` per item in the JSON response.
    """
    items = CLIENT.get(f"/models/{name}/ancestors_cache").json()
    return [ModelAncestor.model_validate(item) for item in items]
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def delete_model(name: str, certain: bool = False) -> ModelOperationStatus:
    """Delete a model in Matchbox.

    Args:
        name: Name of the model, used in the request path.
        certain: Sent to the server as the ``certain`` query parameter.

    Returns:
        The operation status validated from the JSON response.
    """
    query_args = {"certain": certain}
    response = CLIENT.delete(f"/models/{name}", params=query_args)
    return ModelOperationStatus.model_validate(response.json())
|
|
matchbox/client/_settings.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Module to load client settings from env file."""
|
|
2
|
+
|
|
3
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
4
|
+
|
|
5
|
+
from matchbox.common.exceptions import MatchboxClientSettingsException
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ClientSettings(BaseSettings):
    """Settings of the Matchbox client.

    Values are read from environment variables prefixed with ``MB__CLIENT__``
    and from a UTF-8 ``.env`` file; unknown entries are ignored.
    """

    # Required: there is no default, so instantiation fails without it.
    api_root: str
    # Optional request timeout; None unless configured.
    timeout: float | None = None
    # Delay between retries; defaults to 5.
    retry_delay: int = 5
    # Optional default warehouse; None unless configured.
    default_warehouse: str | None = None

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_prefix="MB__CLIENT__",
        env_nested_delimiter="__",
        extra="ignore",
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Settings are loaded once, at import time. A ValueError raised while building
# them is re-raised as the package's own exception so importers can catch it.
try:
    settings = ClientSettings()
except ValueError as err:
    raise MatchboxClientSettingsException from err
|
|
matchbox/client/clean/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Library of default cleaning functions."""
|
|
2
|
+
|
|
3
|
+
from matchbox.client.clean.lib import (
|
|
4
|
+
company_name,
|
|
5
|
+
company_number,
|
|
6
|
+
drop,
|
|
7
|
+
extract_cdms_number_to_new,
|
|
8
|
+
extract_company_number_to_new,
|
|
9
|
+
extract_duns_number_to_new,
|
|
10
|
+
postcode,
|
|
11
|
+
postcode_to_area,
|
|
12
|
+
)
|
|
13
|
+
from matchbox.client.clean.utils import alias, cleaning_function, unnest_renest
|
|
14
|
+
|
|
15
|
+
__all__ = (
|
|
16
|
+
# Cleaning functions
|
|
17
|
+
"company_name",
|
|
18
|
+
"company_number",
|
|
19
|
+
"drop",
|
|
20
|
+
"extract_cdms_number_to_new",
|
|
21
|
+
"extract_company_number_to_new",
|
|
22
|
+
"extract_duns_number_to_new",
|
|
23
|
+
"postcode",
|
|
24
|
+
"postcode_to_area",
|
|
25
|
+
# Utility functions
|
|
26
|
+
"alias",
|
|
27
|
+
"cleaning_function",
|
|
28
|
+
"unnest_renest",
|
|
29
|
+
)
|
|
matchbox/client/clean/lib.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Implementation of default cleaning functions."""
|
|
2
|
+
|
|
3
|
+
from functools import partial
|
|
4
|
+
|
|
5
|
+
from pandas import DataFrame
|
|
6
|
+
|
|
7
|
+
from matchbox.client.clean import steps
|
|
8
|
+
from matchbox.client.clean import utils as cu
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def company_name(
    df: DataFrame,
    column: str,
    column_secondary: str | None = None,
    stopwords: str = cu.STOPWORDS,
) -> DataFrame:
    """Standard cleaning function for company names.

    The primary column goes through these steps, in order:

    * ``clean_punctuation``
    * ``expand_abbreviations``
    * ``tokenise`` (returns an array)
    * ``remove_stopwords``, using ``stopwords``
    * ``list_join_to_string`` (returns a column)

    When ``column_secondary`` is given, the same pipeline is applied to it
    through ``unnest_renest``.

    NOTE(review): earlier documentation also mentioned lower-casing,
    de-duplicating and sorting tokens; whether the steps above do this is not
    visible from this function -- confirm against the step implementations.

    Args:
        df: a dataframe
        column: a column containing the company's main name
        column_secondary: a column containing an array of the company's
            secondary names, or None to clean only the main name
        stopwords: a list of stopwords to use for this clean (annotated as
            ``str`` -- TODO confirm against ``cu.STOPWORDS``)

    Returns:
        dataframe: the same as went in, but cleaned
    """
    remove_stopwords = partial(steps.remove_stopwords, stopwords=stopwords)

    clean_primary = cu.cleaning_function(
        steps.clean_punctuation,
        steps.expand_abbreviations,
        steps.tokenise,  # returns array
        remove_stopwords,
        steps.list_join_to_string,  # returns col
    )

    df = clean_primary(df, column)

    if column_secondary is not None:
        clean_secondary = cu.unnest_renest(clean_primary)
        df = clean_secondary(df, column_secondary)

    return df
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def company_number(df: DataFrame, column: str) -> DataFrame:
    """Remove non-numbers, and then leading zeroes.

    Args:
        df: a dataframe
        column: a column containing a company number

    Returns:
        dataframe: the same as went in, but cleaned
    """
    strip_number = cu.cleaning_function(steps.remove_notnumbers_leadingzeroes)
    return strip_number(df, column)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def postcode(df: DataFrame, column: str) -> DataFrame:
    """Removes all punctuation, converts to upper, removes all spaces.

    Args:
        df: a dataframe
        column: a column containing a postcode

    Returns:
        dataframe: the same as went in, but cleaned
    """
    pipeline = cu.cleaning_function(
        steps.punctuation_to_spaces,
        steps.to_upper,
        steps.remove_whitespace,
    )
    return pipeline(df, column)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def postcode_to_area(df: DataFrame, column: str) -> DataFrame:
    """Extracts postcode area from a postcode.

    Args:
        df: a dataframe
        column: a column containing a postcode

    Returns:
        dataframe: the same as went in, but cleaned
    """
    area_pipeline = cu.cleaning_function(steps.get_postcode_area)
    return area_pipeline(df, column)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def extract_company_number_to_new(
    df: DataFrame, column: str, new_column: str
) -> DataFrame:
    """Detects the Companies House CRN in a column and moves it to a new column.

    Args:
        df: a dataframe
        column: a column containing some company numbers
        new_column: the name of the column to add

    Returns:
        dataframe: the same as went in with a new column for CRNs
    """
    crn_pipeline = cu.cleaning_function(
        steps.clean_punctuation_except_hyphens,
        steps.to_upper,
        steps.filter_company_number,
    )
    # Alias the pipeline to `new_column` before applying it to `column`.
    to_new_column = cu.alias(crn_pipeline, alias=new_column)
    return to_new_column(df, column)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def extract_duns_number_to_new(
    df: DataFrame, column: str, new_column: str
) -> DataFrame:
    """Detects the Dun & Bradstreet DUNS number in a column and moves it to a new column.

    Args:
        df: a dataframe
        column: a column containing some DUNS numbers
        new_column: the name of the column to add

    Returns:
        dataframe: the same as went in with a new column for DUNS numbers
    """
    duns_pipeline = cu.cleaning_function(
        steps.clean_punctuation_except_hyphens,
        steps.to_upper,
        steps.filter_duns_number,
    )
    # Alias the pipeline to `new_column` before applying it to `column`.
    to_new_column = cu.alias(duns_pipeline, alias=new_column)
    return to_new_column(df, column)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def extract_cdms_number_to_new(
    df: DataFrame, column: str, new_column: str
) -> DataFrame:
    """Detects the CDMS number in a column and moves it to a new column.

    Args:
        df: a dataframe
        column: a column containing some CDMS numbers
        new_column: the name of the column to add

    Returns:
        dataframe: the same as went in with a new column for CDMS numbers
    """
    cdms_pipeline = cu.cleaning_function(
        steps.clean_punctuation_except_hyphens,
        steps.to_upper,
        steps.filter_cdms_number,
    )
    # Alias the pipeline to `new_column` before applying it to `column`.
    to_new_column = cu.alias(cdms_pipeline, alias=new_column)
    return to_new_column(df, column)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def drop(df: DataFrame, column: str) -> DataFrame:
    """Drops the column from the dataframe.

    Args:
        df: a dataframe
        column: a column

    Returns:
        dataframe: the same as went in without the column
    """
    to_remove = [column]
    return df.drop(labels=to_remove, axis="columns")
|
|
matchbox/client/clean/steps/__init__.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Low-level components of default cleaning functions."""
|
|
2
|
+
|
|
3
|
+
from matchbox.client.clean.steps.clean_basic import (
|
|
4
|
+
array_except,
|
|
5
|
+
array_intersect,
|
|
6
|
+
clean_punctuation,
|
|
7
|
+
clean_punctuation_except_hyphens,
|
|
8
|
+
dedupe_and_sort,
|
|
9
|
+
expand_abbreviations,
|
|
10
|
+
filter_cdms_number,
|
|
11
|
+
filter_company_number,
|
|
12
|
+
filter_duns_number,
|
|
13
|
+
get_digits_only,
|
|
14
|
+
get_low_freq_char_sig,
|
|
15
|
+
get_postcode_area,
|
|
16
|
+
list_join_to_string,
|
|
17
|
+
periods_to_nothing,
|
|
18
|
+
punctuation_to_spaces,
|
|
19
|
+
regex_extract_list_of_strings,
|
|
20
|
+
regex_remove_list_of_strings,
|
|
21
|
+
remove_notnumbers_leadingzeroes,
|
|
22
|
+
remove_stopwords,
|
|
23
|
+
remove_whitespace,
|
|
24
|
+
to_lower,
|
|
25
|
+
to_upper,
|
|
26
|
+
tokenise,
|
|
27
|
+
trim,
|
|
28
|
+
)
|
|
29
|
+
from matchbox.client.clean.steps.clean_basic_original import (
|
|
30
|
+
cms_original_clean_cdms_id,
|
|
31
|
+
cms_original_clean_ch_id,
|
|
32
|
+
cms_original_clean_company_name_ch,
|
|
33
|
+
cms_original_clean_company_name_general,
|
|
34
|
+
cms_original_clean_email,
|
|
35
|
+
cms_original_clean_postcode,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
__all__ = (
|
|
39
|
+
# Basic steps
|
|
40
|
+
"array_except",
|
|
41
|
+
"array_intersect",
|
|
42
|
+
"periods_to_nothing",
|
|
43
|
+
"punctuation_to_spaces",
|
|
44
|
+
"clean_punctuation",
|
|
45
|
+
"clean_punctuation_except_hyphens",
|
|
46
|
+
"dedupe_and_sort",
|
|
47
|
+
"expand_abbreviations",
|
|
48
|
+
"filter_cdms_number",
|
|
49
|
+
"filter_company_number",
|
|
50
|
+
"filter_duns_number",
|
|
51
|
+
"get_digits_only",
|
|
52
|
+
"get_low_freq_char_sig",
|
|
53
|
+
"get_postcode_area",
|
|
54
|
+
"list_join_to_string",
|
|
55
|
+
"regex_extract_list_of_strings",
|
|
56
|
+
"regex_remove_list_of_strings",
|
|
57
|
+
"remove_notnumbers_leadingzeroes",
|
|
58
|
+
"remove_stopwords",
|
|
59
|
+
"remove_whitespace",
|
|
60
|
+
"to_lower",
|
|
61
|
+
"to_upper",
|
|
62
|
+
"tokenise",
|
|
63
|
+
"trim",
|
|
64
|
+
# Original CMS steps
|
|
65
|
+
"cms_original_clean_cdms_id",
|
|
66
|
+
"cms_original_clean_ch_id",
|
|
67
|
+
"cms_original_clean_company_name_ch",
|
|
68
|
+
"cms_original_clean_company_name_general",
|
|
69
|
+
"cms_original_clean_email",
|
|
70
|
+
"cms_original_clean_postcode",
|
|
71
|
+
)
|