matchbox-db 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. matchbox/__init__.py +13 -0
  2. matchbox/client/__init__.py +9 -0
  3. matchbox/client/_handler.py +302 -0
  4. matchbox/client/_settings.py +26 -0
  5. matchbox/client/clean/__init__.py +29 -0
  6. matchbox/client/clean/lib.py +191 -0
  7. matchbox/client/clean/steps/__init__.py +71 -0
  8. matchbox/client/clean/steps/clean_basic.py +508 -0
  9. matchbox/client/clean/steps/clean_basic_original.py +128 -0
  10. matchbox/client/clean/utils.py +158 -0
  11. matchbox/client/helpers/__init__.py +15 -0
  12. matchbox/client/helpers/cleaner.py +60 -0
  13. matchbox/client/helpers/comparison.py +47 -0
  14. matchbox/client/helpers/index.py +68 -0
  15. matchbox/client/helpers/selector.py +253 -0
  16. matchbox/client/models/__init__.py +1 -0
  17. matchbox/client/models/dedupers/__init__.py +5 -0
  18. matchbox/client/models/dedupers/base.py +54 -0
  19. matchbox/client/models/dedupers/naive.py +83 -0
  20. matchbox/client/models/linkers/__init__.py +9 -0
  21. matchbox/client/models/linkers/base.py +55 -0
  22. matchbox/client/models/linkers/deterministic.py +93 -0
  23. matchbox/client/models/linkers/splinklinker.py +253 -0
  24. matchbox/client/models/linkers/weighteddeterministic.py +166 -0
  25. matchbox/client/models/models.py +168 -0
  26. matchbox/client/results.py +217 -0
  27. matchbox/client/visualisation.py +41 -0
  28. matchbox/common/__init__.py +1 -0
  29. matchbox/common/arrow.py +24 -0
  30. matchbox/common/db.py +121 -0
  31. matchbox/common/dtos.py +225 -0
  32. matchbox/common/exceptions.py +176 -0
  33. matchbox/common/factories/__init__.py +1 -0
  34. matchbox/common/factories/dags.py +137 -0
  35. matchbox/common/factories/entities.py +629 -0
  36. matchbox/common/factories/models.py +945 -0
  37. matchbox/common/factories/sources.py +660 -0
  38. matchbox/common/graph.py +61 -0
  39. matchbox/common/hash.py +156 -0
  40. matchbox/common/logging.py +72 -0
  41. matchbox/common/sources.py +333 -0
  42. matchbox/common/transform.py +406 -0
  43. matchbox/server/__init__.py +15 -0
  44. matchbox/server/api/__init__.py +5 -0
  45. matchbox/server/api/arrow.py +75 -0
  46. matchbox/server/api/cache.py +196 -0
  47. matchbox/server/api/routes.py +674 -0
  48. matchbox/server/base.py +459 -0
  49. matchbox/server/postgresql/__init__.py +8 -0
  50. matchbox/server/postgresql/adapter.py +456 -0
  51. matchbox/server/postgresql/benchmark/__init__.py +1 -0
  52. matchbox/server/postgresql/benchmark/cluster_pipeline.py +83 -0
  53. matchbox/server/postgresql/benchmark/generate_tables.py +560 -0
  54. matchbox/server/postgresql/benchmark/query.py +93 -0
  55. matchbox/server/postgresql/db.py +102 -0
  56. matchbox/server/postgresql/mixin.py +19 -0
  57. matchbox/server/postgresql/orm.py +295 -0
  58. matchbox/server/postgresql/utils/__init__.py +1 -0
  59. matchbox/server/postgresql/utils/db.py +308 -0
  60. matchbox/server/postgresql/utils/insert.py +558 -0
  61. matchbox/server/postgresql/utils/query.py +584 -0
  62. matchbox/server/postgresql/utils/results.py +198 -0
  63. matchbox_db-0.2.2.dist-info/LICENSE +21 -0
  64. matchbox_db-0.2.2.dist-info/METADATA +160 -0
  65. matchbox_db-0.2.2.dist-info/RECORD +67 -0
  66. matchbox_db-0.2.2.dist-info/WHEEL +5 -0
  67. matchbox_db-0.2.2.dist-info/top_level.txt +1 -0
matchbox/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """Matchbox."""
2
+
3
+ from matchbox.common.exceptions import MatchboxClientSettingsException
4
+ from matchbox.common.logging import logger
5
+
6
# The client package can raise MatchboxClientSettingsException while it is being
# imported (matchbox.client._settings raises it when its settings cannot be
# built). That failure is downgraded to a warning so the package stays
# importable, e.g. when running in server mode.
try:
    # Environment variables must be loaded first for other imports to work
    from matchbox.client import *  # noqa: E402, F403
except MatchboxClientSettingsException:
    logger.warning(
        "Impossible to initialise client. "
        "Please ignore if running in server mode. Otherwise, check your .env file.",
    )
@@ -0,0 +1,9 @@
1
+ """All client-side functionalities of Matchbox."""
2
+
3
+ from matchbox.client.helpers.cleaner import process
4
+ from matchbox.client.helpers.index import index
5
+ from matchbox.client.helpers.selector import match, query
6
+ from matchbox.client.models.models import make_model
7
+ from matchbox.client.visualisation import draw_resolution_graph
8
+
9
+ __all__ = ("process", "index", "match", "query", "make_model", "draw_resolution_graph")
@@ -0,0 +1,302 @@
1
+ """Functions abstracting the interaction with the server API."""
2
+
3
+ import time
4
+ from collections.abc import Iterable
5
+ from io import BytesIO
6
+
7
+ import httpx
8
+ from pyarrow import Table
9
+ from pyarrow.parquet import read_table
10
+
11
+ from matchbox.client._settings import ClientSettings, settings
12
+ from matchbox.common.arrow import SCHEMA_MB_IDS, table_to_buffer
13
+ from matchbox.common.dtos import (
14
+ BackendRetrievableType,
15
+ ModelAncestor,
16
+ ModelMetadata,
17
+ ModelOperationStatus,
18
+ NotFoundError,
19
+ UploadStatus,
20
+ )
21
+ from matchbox.common.exceptions import (
22
+ MatchboxClientFileError,
23
+ MatchboxDeletionNotConfirmed,
24
+ MatchboxResolutionNotFoundError,
25
+ MatchboxServerFileError,
26
+ MatchboxSourceNotFoundError,
27
+ MatchboxUnhandledServerResponse,
28
+ MatchboxUnparsedClientRequest,
29
+ )
30
+ from matchbox.common.graph import ResolutionGraph
31
+ from matchbox.common.hash import hash_to_base64
32
+ from matchbox.common.sources import Match, Source, SourceAddress
33
+
34
+ URLEncodeHandledType = str | int | float | bytes
35
+
36
+
37
+ def encode_param_value(
38
+ v: URLEncodeHandledType | Iterable[URLEncodeHandledType],
39
+ ) -> str | list[str]:
40
+ if isinstance(v, str):
41
+ return v
42
+ elif isinstance(v, (int, float)):
43
+ return str(v)
44
+ elif isinstance(v, bytes):
45
+ return hash_to_base64(v)
46
+ # Needs to be at the end, so we don't apply it to e.g. strings
47
+ if isinstance(v, Iterable):
48
+ return [encode_param_value(item) for item in v]
49
+ raise ValueError(f"It was not possible to parse {v} as an URL parameter")
50
+
51
+
52
+ def url_params(
53
+ params: dict[str, URLEncodeHandledType | Iterable[URLEncodeHandledType]],
54
+ ) -> dict[str, str | list[str]]:
55
+ """Prepares a dictionary of parameters to be encoded in a URL."""
56
+ non_null = {k: v for k, v in params.items() if v}
57
+ return {k: encode_param_value(v) for k, v in non_null.items()}
58
+
59
+
60
def handle_http_code(res: httpx.Response) -> httpx.Response:
    """Handle HTTP status codes and raise appropriate exceptions.

    Args:
        res: The response received from the server.

    Returns:
        The same response, when its status code is in the 2xx range.

    Raises:
        MatchboxServerFileError: On a 400 whose body is a valid `UploadStatus`.
        RuntimeError: On a 400 whose body is not a valid `UploadStatus`, or a
            404 about an entity type that is not handled here.
        MatchboxSourceNotFoundError: On a 404 about a source.
        MatchboxResolutionNotFoundError: On a 404 about a resolution.
        MatchboxDeletionNotConfirmed: On a 409.
        MatchboxUnparsedClientRequest: On a 422.
        MatchboxUnhandledServerResponse: On any other status code.
    """
    # Load the body up front: this function is installed as a response event
    # hook, which HTTPX calls before the response body has been read
    res.read()

    if 299 >= res.status_code >= 200:
        return res

    if res.status_code == 400:
        # Validation raises rather than returning a falsy value, so the
        # "unexpected payload" case has to be caught explicitly. Pydantic's
        # ValidationError is a ValueError subclass.
        try:
            error = UploadStatus.model_validate_json(res.content, strict=False)
        except ValueError as e:
            raise RuntimeError(f"Unexpected 400 error: {res.content}") from e
        raise MatchboxServerFileError(error.details)

    if res.status_code == 404:
        error = NotFoundError.model_validate(res.json())
        if error.entity == BackendRetrievableType.SOURCE:
            raise MatchboxSourceNotFoundError(error.details)
        if error.entity == BackendRetrievableType.RESOLUTION:
            raise MatchboxResolutionNotFoundError(error.details)
        else:
            raise RuntimeError(f"Unexpected 404 error: {error.details}")

    if res.status_code == 409:
        error = ModelOperationStatus.model_validate(res.json())
        raise MatchboxDeletionNotConfirmed(message=error.details)

    if res.status_code == 422:
        raise MatchboxUnparsedClientRequest(res.content)

    raise MatchboxUnhandledServerResponse(res.content)
91
+
92
+
93
def create_client(settings: ClientSettings) -> httpx.Client:
    """Build the HTTPX client used to reach the server API.

    Args:
        settings: Client settings providing the API root and the timeout.

    Returns:
        A client bound to `settings.api_root` that runs `handle_http_code`
        on every response.
    """
    response_hooks = [handle_http_code]
    client = httpx.Client(
        base_url=settings.api_root,
        timeout=settings.timeout,
        event_hooks={"response": response_hooks},
    )
    return client
100
+
101
+
102
# Module-level client, created once at import time from the loaded settings
CLIENT = create_client(settings)
103
+
104
+ # Retrieval
105
+
106
+
107
def query(
    source_address: SourceAddress,
    resolution_name: str | None = None,
    threshold: int | None = None,
    limit: int | None = None,
) -> Table:
    """Query a source through the server API and return the result as a table.

    Args:
        source_address: Address of the source to query.
        resolution_name: Optional resolution name forwarded to the server.
        threshold: Optional threshold forwarded to the server.
        limit: Optional limit forwarded to the server.

    Returns:
        The Arrow table read from the Parquet body of the response.

    Raises:
        MatchboxClientFileError: If the schema of the returned table is not
            equal to `SCHEMA_MB_IDS`.
    """
    res = CLIENT.get(
        "/query",
        params=url_params(
            {
                "full_name": source_address.full_name,
                # Converted to b64 by `url_params()`
                "warehouse_hash_b64": source_address.warehouse_hash,
                "resolution_name": resolution_name,
                "threshold": threshold,
                "limit": limit,
            }
        ),
    )

    buffer = BytesIO(res.content)
    table = read_table(buffer)

    if not table.schema.equals(SCHEMA_MB_IDS):
        raise MatchboxClientFileError(
            message=(
                f"Schema mismatch. Expected:\n{SCHEMA_MB_IDS}\nGot:\n{table.schema}"
            )
        )

    return table
138
+
139
+
140
def match(
    targets: list[SourceAddress],
    source: SourceAddress,
    source_pk: str,
    resolution_name: str,
    threshold: int | None = None,
) -> list[Match]:
    """Ask the server to match a primary key in one source against target sources.

    Args:
        targets: Addresses of the target sources.
        source: Address of the source the primary key belongs to.
        source_pk: The primary key to match.
        resolution_name: Resolution name forwarded to the server.
        threshold: Optional threshold forwarded to the server.

    Returns:
        One validated `Match` per item in the JSON response.
    """
    target_full_names = [t.full_name for t in targets]
    target_warehouse_hashes = [t.warehouse_hash for t in targets]

    res = CLIENT.get(
        "/match",
        params=url_params(
            {
                "target_full_names": target_full_names,
                # Converted to b64 by `url_params()`
                "target_warehouse_hashes_b64": target_warehouse_hashes,
                "source_full_name": source.full_name,
                # Converted to b64 by `url_params()`
                "source_warehouse_hash_b64": source.warehouse_hash,
                "source_pk": source_pk,
                "resolution_name": resolution_name,
                "threshold": threshold,
            }
        ),
    )

    return [Match.model_validate(m) for m in res.json()]
168
+
169
+
170
+ # Data management
171
+
172
+
173
def index(source: Source, data_hashes: Table) -> UploadStatus:
    """Index a Source in Matchbox.

    Posts the source metadata, uploads the buffer built from `data_hashes` as
    a `.parquet` file, then polls the upload status until it is terminal.

    Args:
        source: The source whose metadata is posted to the server.
        data_hashes: Table uploaded as the data of the source.

    Returns:
        The final upload status, once it is "complete".

    Raises:
        MatchboxServerFileError: If the upload ends in the "failed" state.
    """
    buffer = table_to_buffer(table=data_hashes)

    # Upload metadata
    metadata_res = CLIENT.post("/sources", json=source.model_dump())

    upload = UploadStatus.model_validate(metadata_res.json())

    # Upload data
    upload_res = CLIENT.post(
        f"/upload/{upload.id}",
        files={"file": (f"{upload.id}.parquet", buffer, "application/octet-stream")},
    )

    # Poll until complete with retry/timeout configuration
    terminal_states = ("complete", "failed")
    status = UploadStatus.model_validate(upload_res.json())
    while status.status not in terminal_states:
        status_res = CLIENT.get(f"/upload/{upload.id}/status")
        status = UploadStatus.model_validate(status_res.json())

        # Only wait when another poll is actually needed
        if status.status not in terminal_states:
            time.sleep(settings.retry_delay)

    # Checked after the loop so that a failure already reported by the upload
    # response is raised too, instead of being returned silently
    if status.status == "failed":
        raise MatchboxServerFileError(status.details)

    return status
200
+
201
+
202
def get_source(address: SourceAddress) -> Source:
    """Fetch the source stored at the given address.

    Args:
        address: Address of the source; its warehouse hash is base64-encoded
            with `hash_to_base64()` to build the request path.

    Returns:
        The `Source` validated from the JSON response.
    """
    encoded_hash = hash_to_base64(address.warehouse_hash)
    endpoint = f"/sources/{encoded_hash}/{address.full_name}"
    payload = CLIENT.get(endpoint).json()
    return Source.model_validate(payload)
207
+
208
+
209
def get_resolution_graph() -> ResolutionGraph:
    """Retrieve the resolution graph reported by the server.

    Returns:
        The `ResolutionGraph` validated from the JSON response.
    """
    payload = CLIENT.get("/report/resolutions").json()
    return ResolutionGraph.model_validate(payload)
213
+
214
+
215
+ # Model management
216
+
217
+
218
def insert_model(model: ModelMetadata) -> ModelOperationStatus:
    """Send a model's metadata to the server for insertion.

    Args:
        model: Metadata of the model, posted as its `model_dump()`.

    Returns:
        The operation status validated from the JSON response.
    """
    body = model.model_dump()
    payload = CLIENT.post("/models", json=body).json()
    return ModelOperationStatus.model_validate(payload)
222
+
223
+
224
def get_model(name: str) -> ModelMetadata:
    """Fetch the metadata of a model by name.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        The `ModelMetadata` validated from the JSON response.
    """
    payload = CLIENT.get(f"/models/{name}").json()
    return ModelMetadata.model_validate(payload)
227
+
228
+
229
def add_model_results(name: str, results: Table) -> UploadStatus:
    """Upload model results in Matchbox.

    Initialises an upload for the model, sends the buffer built from
    `results` as a `.parquet` file, then polls the upload status until it is
    terminal.

    Args:
        name: Name of the model the results belong to.
        results: Table uploaded as the results of the model.

    Returns:
        The final upload status, once it is "complete".

    Raises:
        MatchboxServerFileError: If the upload ends in the "failed" state.
    """
    buffer = table_to_buffer(table=results)

    # Initialise upload
    metadata_res = CLIENT.post(f"/models/{name}/results")

    upload = UploadStatus.model_validate(metadata_res.json())

    # Upload data
    upload_res = CLIENT.post(
        f"/upload/{upload.id}",
        files={"file": (f"{upload.id}.parquet", buffer, "application/octet-stream")},
    )

    # Poll until complete with retry/timeout configuration
    terminal_states = ("complete", "failed")
    status = UploadStatus.model_validate(upload_res.json())
    while status.status not in terminal_states:
        status_res = CLIENT.get(f"/upload/{upload.id}/status")
        status = UploadStatus.model_validate(status_res.json())

        # Only wait when another poll is actually needed
        if status.status not in terminal_states:
            time.sleep(settings.retry_delay)

    # Checked after the loop so that a failure already reported by the upload
    # response is raised too, instead of being returned silently
    if status.status == "failed":
        raise MatchboxServerFileError(status.details)

    return status
256
+
257
+
258
def get_model_results(name: str) -> Table:
    """Download the results of a model as a table.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        The table read from the Parquet body of the response.
    """
    response = CLIENT.get(f"/models/{name}/results")
    return read_table(BytesIO(response.content))
263
+
264
+
265
def set_model_truth(name: str, truth: float) -> ModelOperationStatus:
    """Update the truth threshold of a model.

    Args:
        name: Name of the model, used in the request path.
        truth: The new truth value, sent as the JSON body.

    Returns:
        The operation status validated from the JSON response.
    """
    payload = CLIENT.patch(f"/models/{name}/truth", json=truth).json()
    return ModelOperationStatus.model_validate(payload)
269
+
270
+
271
def get_model_truth(name: str) -> float:
    """Read the truth threshold of a model.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        The decoded JSON body of the response, returned without validation.
    """
    return CLIENT.get(f"/models/{name}/truth").json()
275
+
276
+
277
def get_model_ancestors(name: str) -> list[ModelAncestor]:
    """List the ancestors of a model.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        One validated `ModelAncestor` per item in the JSON response.
    """
    items = CLIENT.get(f"/models/{name}/ancestors").json()
    return [ModelAncestor.model_validate(item) for item in items]
281
+
282
+
283
def set_model_ancestors_cache(
    name: str, ancestors: list[ModelAncestor]
) -> ModelOperationStatus:
    """Store the ancestors cache of a model.

    Args:
        name: Name of the model, used in the request path.
        ancestors: Ancestors to store; each is sent as its `model_dump()`.

    Returns:
        The operation status validated from the JSON response.
    """
    endpoint = f"/models/{name}/ancestors_cache"
    body = [ancestor.model_dump() for ancestor in ancestors]
    payload = CLIENT.post(endpoint, json=body).json()
    return ModelOperationStatus.model_validate(payload)
291
+
292
+
293
def get_model_ancestors_cache(name: str) -> list[ModelAncestor]:
    """List the entries of a model's ancestors cache.

    Args:
        name: Name of the model, used in the request path.

    Returns:
        One validated `ModelAncestor` per item in the JSON response.
    """
    items = CLIENT.get(f"/models/{name}/ancestors_cache").json()
    return [ModelAncestor.model_validate(item) for item in items]
297
+
298
+
299
def delete_model(name: str, certain: bool = False) -> ModelOperationStatus:
    """Request the deletion of a model.

    Args:
        name: Name of the model, used in the request path.
        certain: Forwarded to the server as the `certain` query parameter.

    Returns:
        The operation status validated from the JSON response.
    """
    query_params = {"certain": certain}
    payload = CLIENT.delete(f"/models/{name}", params=query_params).json()
    return ModelOperationStatus.model_validate(payload)
@@ -0,0 +1,26 @@
1
+ """Module to load client settings from env file."""
2
+
3
+ from pydantic_settings import BaseSettings, SettingsConfigDict
4
+
5
+ from matchbox.common.exceptions import MatchboxClientSettingsException
6
+
7
+
8
class ClientSettings(BaseSettings):
    """Settings of the Matchbox client.

    Values are read from environment variables prefixed with `MB__CLIENT__`
    and from a UTF-8 encoded `.env` file; unknown entries are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_prefix="MB__CLIENT__",
        env_nested_delimiter="__",
        extra="ignore",
    )

    # Root URL of the server API (required)
    api_root: str
    # Timeout handed to the HTTP client
    timeout: float | None = None
    # Delay between two polls of an upload status
    retry_delay: int = 5
    # Presumably the warehouse used when none is given -- not used in this
    # module, confirm against callers
    default_warehouse: str | None = None
21
+
22
+
23
# Settings are built once, at import time. A validation failure is re-raised
# as the package's own exception type.
try:
    settings = ClientSettings()
except ValueError as validation_error:
    raise MatchboxClientSettingsException from validation_error
@@ -0,0 +1,29 @@
1
+ """Library of default cleaning functions."""
2
+
3
+ from matchbox.client.clean.lib import (
4
+ company_name,
5
+ company_number,
6
+ drop,
7
+ extract_cdms_number_to_new,
8
+ extract_company_number_to_new,
9
+ extract_duns_number_to_new,
10
+ postcode,
11
+ postcode_to_area,
12
+ )
13
+ from matchbox.client.clean.utils import alias, cleaning_function, unnest_renest
14
+
15
+ __all__ = (
16
+ # Cleaning functions
17
+ "company_name",
18
+ "company_number",
19
+ "drop",
20
+ "extract_cdms_number_to_new",
21
+ "extract_company_number_to_new",
22
+ "extract_duns_number_to_new",
23
+ "postcode",
24
+ "postcode_to_area",
25
+ # Utility functions
26
+ "alias",
27
+ "cleaning_function",
28
+ "unnest_renest",
29
+ )
@@ -0,0 +1,191 @@
1
+ """Implementation of default cleaning functions."""
2
+
3
+ from functools import partial
4
+
5
+ from pandas import DataFrame
6
+
7
+ from matchbox.client.clean import steps
8
+ from matchbox.client.clean import utils as cu
9
+
10
+
11
def company_name(
    df: DataFrame,
    column: str,
    column_secondary: str | None = None,
    stopwords: str = cu.STOPWORDS,
) -> DataFrame:
    """Standard cleaning function for company names.

    The following steps are applied, in order:

    * Clean punctuation
    * Expand abbreviations
    * Tokenise the company name into an array
    * Remove the stopwords from the tokens
    * Join the remaining tokens back into a string

    Args:
        df: a dataframe
        column: a column containing the company's main name
        column_secondary: a column containing an array of the company's
            secondary names. When given, every name in the array goes
            through the same steps as the main name
        stopwords: the stopwords to use for this clean.
            NOTE(review): annotated as `str` but historically described as a
            list -- confirm against `cu.STOPWORDS`

    Returns:
        dataframe: the same as went in, but cleaned
    """
    remove_stopwords = partial(steps.remove_stopwords, stopwords=stopwords)

    clean_primary = cu.cleaning_function(
        steps.clean_punctuation,
        steps.expand_abbreviations,
        steps.tokenise,  # returns array
        remove_stopwords,
        steps.list_join_to_string,  # returns col
    )

    clean_secondary = cu.unnest_renest(clean_primary)

    df = clean_primary(df, column)

    if column_secondary is not None:
        df = clean_secondary(df, column_secondary)

    return df
51
+
52
+
53
def company_number(df: DataFrame, column: str) -> DataFrame:
    """Strip non-numbers and leading zeroes from a company number column.

    Args:
        df: a dataframe
        column: a column containing a company number

    Returns:
        dataframe: the same as went in, but cleaned
    """
    strip_to_number = cu.cleaning_function(steps.remove_notnumbers_leadingzeroes)
    return strip_to_number(df, column)
68
+
69
+
70
def postcode(df: DataFrame, column: str) -> DataFrame:
    """Clean a postcode column.

    Punctuation is turned into spaces, the value is upper-cased, and
    whitespace is then removed.

    Args:
        df: a dataframe
        column: a column containing a postcode

    Returns:
        dataframe: the same as went in, but cleaned
    """
    pipeline = (
        steps.punctuation_to_spaces,
        steps.to_upper,
        steps.remove_whitespace,
    )
    normalise = cu.cleaning_function(*pipeline)
    return normalise(df, column)
88
+
89
+
90
def postcode_to_area(df: DataFrame, column: str) -> DataFrame:
    """Reduce a postcode column to its postcode area.

    Args:
        df: a dataframe
        column: a column containing a postcode

    Returns:
        dataframe: the same as went in, but cleaned
    """
    to_area = cu.cleaning_function(steps.get_postcode_area)
    return to_area(df, column)
105
+
106
+
107
def extract_company_number_to_new(
    df: DataFrame, column: str, new_column: str
) -> DataFrame:
    """Detects the Companies House CRN in a column and puts it in a new column.

    Args:
        df: a dataframe
        column: a column containing some company numbers
        new_column: the name of the column to add

    Returns:
        dataframe: the same as went in with a new column for CRNs
    """
    pipeline = cu.cleaning_function(
        steps.clean_punctuation_except_hyphens,
        steps.to_upper,
        steps.filter_company_number,
    )
    # Alias the pipeline so its output is written under `new_column`
    extract = cu.alias(pipeline, alias=new_column)
    return extract(df, column)
131
+
132
+
133
def extract_duns_number_to_new(
    df: DataFrame, column: str, new_column: str
) -> DataFrame:
    """Detects the Dun & Bradstreet DUNS number in a column and puts it in a new column.

    Args:
        df: a dataframe
        column: a column containing some DUNS numbers
        new_column: the name of the column to add

    Returns:
        dataframe: the same as went in with a new column for DUNS numbers
    """
    pipeline = cu.cleaning_function(
        steps.clean_punctuation_except_hyphens,
        steps.to_upper,
        steps.filter_duns_number,
    )
    # Alias the pipeline so its output is written under `new_column`
    extract = cu.alias(pipeline, alias=new_column)
    return extract(df, column)
155
+
156
+
157
def extract_cdms_number_to_new(
    df: DataFrame, column: str, new_column: str
) -> DataFrame:
    """Detects the CDMS number in a column and puts it in a new column.

    Args:
        df: a dataframe
        column: a column containing some CDMS numbers
        new_column: the name of the column to add

    Returns:
        dataframe: the same as went in with a new column for CDMS numbers
    """
    pipeline = cu.cleaning_function(
        steps.clean_punctuation_except_hyphens,
        steps.to_upper,
        steps.filter_cdms_number,
    )
    # Alias the pipeline so its output is written under `new_column`
    extract = cu.alias(pipeline, alias=new_column)
    return extract(df, column)
179
+
180
+
181
def drop(df: DataFrame, column: str) -> DataFrame:
    """Return the dataframe without the given column.

    Args:
        df: a dataframe
        column: the name of the column to remove

    Returns:
        dataframe: the same as went in without the column
    """
    columns_to_remove = [column]
    without_column = df.drop(columns=columns_to_remove)
    return without_column
@@ -0,0 +1,71 @@
1
+ """Low-level components of default cleaning functions."""
2
+
3
+ from matchbox.client.clean.steps.clean_basic import (
4
+ array_except,
5
+ array_intersect,
6
+ clean_punctuation,
7
+ clean_punctuation_except_hyphens,
8
+ dedupe_and_sort,
9
+ expand_abbreviations,
10
+ filter_cdms_number,
11
+ filter_company_number,
12
+ filter_duns_number,
13
+ get_digits_only,
14
+ get_low_freq_char_sig,
15
+ get_postcode_area,
16
+ list_join_to_string,
17
+ periods_to_nothing,
18
+ punctuation_to_spaces,
19
+ regex_extract_list_of_strings,
20
+ regex_remove_list_of_strings,
21
+ remove_notnumbers_leadingzeroes,
22
+ remove_stopwords,
23
+ remove_whitespace,
24
+ to_lower,
25
+ to_upper,
26
+ tokenise,
27
+ trim,
28
+ )
29
+ from matchbox.client.clean.steps.clean_basic_original import (
30
+ cms_original_clean_cdms_id,
31
+ cms_original_clean_ch_id,
32
+ cms_original_clean_company_name_ch,
33
+ cms_original_clean_company_name_general,
34
+ cms_original_clean_email,
35
+ cms_original_clean_postcode,
36
+ )
37
+
38
+ __all__ = (
39
+ # Basic steps
40
+ "array_except",
41
+ "array_intersect",
42
+ "periods_to_nothing",
43
+ "punctuation_to_spaces",
44
+ "clean_punctuation",
45
+ "clean_punctuation_except_hyphens",
46
+ "dedupe_and_sort",
47
+ "expand_abbreviations",
48
+ "filter_cdms_number",
49
+ "filter_company_number",
50
+ "filter_duns_number",
51
+ "get_digits_only",
52
+ "get_low_freq_char_sig",
53
+ "get_postcode_area",
54
+ "list_join_to_string",
55
+ "regex_extract_list_of_strings",
56
+ "regex_remove_list_of_strings",
57
+ "remove_notnumbers_leadingzeroes",
58
+ "remove_stopwords",
59
+ "remove_whitespace",
60
+ "to_lower",
61
+ "to_upper",
62
+ "tokenise",
63
+ "trim",
64
+ # Original CMS steps
65
+ "cms_original_clean_cdms_id",
66
+ "cms_original_clean_ch_id",
67
+ "cms_original_clean_company_name_ch",
68
+ "cms_original_clean_company_name_general",
69
+ "cms_original_clean_email",
70
+ "cms_original_clean_postcode",
71
+ )