idscrub 1.1.2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
test/conftest.py CHANGED
@@ -20,3 +20,39 @@ def scrub_object_all():
20
20
  "My number is +441111111111 and I live at AA11 1AA.",
21
21
  ]
22
22
  )
23
+
24
+
25
+ @pytest.fixture
26
+ def idents():
27
+ return [
28
+ IDScrub.IDEnt(
29
+ text_id="A",
30
+ text="The quick brown fox jumps over the lazy dog.",
31
+ start=10,
32
+ end=19,
33
+ label="animal",
34
+ replacement="[ANIMAL]",
35
+ priority=0.92,
36
+ source="custom_regex",
37
+ ),
38
+ IDScrub.IDEnt(
39
+ text_id="A",
40
+ text="My phone number is 123-456-7890.",
41
+ start=19,
42
+ end=31,
43
+ label="phone_number",
44
+ replacement="[PHONE]",
45
+ priority=0.76,
46
+ source="google",
47
+ ),
48
+ IDScrub.IDEnt(
49
+ text_id="B",
50
+ text="Email me at example@example.com.",
51
+ start=12,
52
+ end=31,
53
+ label="email",
54
+ replacement="[EMAIL]",
55
+ priority=0.88,
56
+ source="email",
57
+ ),
58
+ ]
test/test_dataframe.py CHANGED
@@ -20,7 +20,7 @@ def test_dataframe_outputs():
20
20
  }
21
21
  )
22
22
 
23
- scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["all"])
23
+ scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID")
24
24
 
25
25
  expected_scrubbed_df = pd.DataFrame(
26
26
  {
@@ -43,7 +43,6 @@ def test_dataframe_outputs():
43
43
  "person": [["Darcy", "Elizabeth"], ["Bennet"], None, ["Mick Jagger", "David Bowie"]],
44
44
  "title": [["Mr"], ["Mr"], None, None],
45
45
  "email_address": [None, None, ["freddie-mercury@queen.com"], None],
46
- "url": [None, None, ["queen.com"], None],
47
46
  "uk_postcode": [None, None, ["SW1A 2AA"], ["SW1A 2WH"]],
48
47
  }
49
48
  )
@@ -67,9 +66,7 @@ def test_dataframe_exclude():
67
66
  }
68
67
  )
69
68
 
70
- scrubbed_df, scrubbed_data = IDScrub.dataframe(
71
- df=df, id_col="ID", exclude_cols=["Fake book"], scrub_methods=["all"]
72
- )
69
+ scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", exclude_cols=["Fake book"])
73
70
 
74
71
  expected_scrubbed_df = pd.DataFrame(
75
72
  {
@@ -113,7 +110,7 @@ def test_dataframe_scrub_methods():
113
110
  }
114
111
  )
115
112
 
116
- scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", scrub_methods=["titles"])
113
+ scrubbed_df, scrubbed_data = IDScrub.dataframe(df=df, id_col="ID", pipeline=[{"method": "titles"}])
117
114
 
118
115
  expected_scrubbed_df = pd.DataFrame(
119
116
  {
@@ -141,7 +138,7 @@ def test_dataframe_scrub_methods():
141
138
  assert_frame_equal(scrubbed_data, expected_scrubbed_data)
142
139
 
143
140
 
144
- def test_dataframe_id_col():
141
+ def test_dataframe_errors():
145
142
  df = pd.DataFrame(
146
143
  {
147
144
  "ID": [1, 2],
@@ -156,5 +153,8 @@ def test_dataframe_id_col():
156
153
  }
157
154
  )
158
155
 
159
- with pytest.raises(AssertionError):
156
+ with pytest.raises(ValueError):
160
157
  IDScrub.dataframe(df=df, id_col="ID_not_present")
158
+
159
+ with pytest.raises(TypeError):
160
+ IDScrub.dataframe(df=1, id_col="ID_not_present")
test/test_errors.py ADDED
@@ -0,0 +1,32 @@
1
+ import pytest
2
+ from idscrub import IDScrub
3
+
4
+
5
+ def test_scrub_input():
6
+ with pytest.raises(TypeError):
7
+ IDScrub(texts=[123])
8
+ with pytest.raises(TypeError):
9
+ IDScrub(texts=[1, 2, 3])
10
+ with pytest.raises(TypeError):
11
+ IDScrub(texts=[1.0, 2.0, 3.0])
12
+ with pytest.raises(TypeError):
13
+ IDScrub(texts="not_a_list")
14
+
15
+
16
+ def test_scrub_input_text_ids():
17
+ with pytest.raises(ValueError):
18
+ IDScrub(texts=["Hello"], text_ids=[1, 2])
19
+
20
+
21
+ def test_replacement_error():
22
+ with pytest.raises(TypeError):
23
+ IDScrub(texts=["Hello"], text_ids=[1], replacement=1)
24
+ with pytest.raises(TypeError):
25
+ IDScrub(texts=["Hello"], text_ids=[1], replacement=1.0)
26
+ with pytest.raises(TypeError):
27
+ IDScrub(texts=["Hello"], text_ids=[1], replacement=["ok"])
28
+
29
+
30
+ def test_scrub_pipeline_error(scrub_object):
31
+ with pytest.raises(TypeError):
32
+ scrub_object.scrub(pipeline={"method": "spacy_entities"})
test/test_exclude.py ADDED
@@ -0,0 +1,22 @@
1
+ from idscrub import IDScrub
2
+
3
+
4
+ def test_exclude():
5
+ scrub = IDScrub(
6
+ [
7
+ "Our names are Hamish McDonald, L. Salah, and Elena Suárez.",
8
+ ],
9
+ exclude=["Hamish McDonald", "L. Salah"],
10
+ )
11
+ scrubbed = scrub.scrub(
12
+ pipeline=[{"method": "spacy_entities"}],
13
+ )
14
+
15
+ assert scrubbed == [
16
+ "Our names are Hamish McDonald, L. Salah, and [PERSON].",
17
+ ]
18
+
19
+ assert scrub.idents_all[0].text == "Hamish McDonald"
20
+ assert scrub.idents_all[1].text == "L. Salah"
21
+
22
+ assert [ident.text for ident in scrub.idents] not in ["Hamish McDonald", "L. Salah"]
test/test_group.py ADDED
@@ -0,0 +1,9 @@
1
+ from idscrub import IDScrub
2
+
3
+
4
+ def test_group_idents(idents):
5
+ scrub = IDScrub(texts=[])
6
+ entities_grouped = scrub.group_idents(idents)
7
+
8
+ assert len(entities_grouped) == 2
9
+ assert list(entities_grouped.keys()) == ["A", "B"]
test/test_huggingface.py CHANGED
@@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal
6
6
 
7
7
  def test_huggingface():
8
8
  scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
9
- scrubbed = scrub.huggingface_entities()
9
+ scrubbed = scrub.scrub(pipeline=[{"method": "huggingface_entities"}])
10
10
  assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON]."]
11
11
 
12
12
 
@@ -14,12 +14,12 @@ def test_huggingface_error():
14
14
  scrub = IDScrub(texts=["Our names are Hamish McDonald, L. Salah, and Elena Suárez."])
15
15
 
16
16
  with pytest.raises(OSError):
17
- scrub.huggingface_entities(hf_model_path="not_a_path")
17
+ scrub.scrub(pipeline=[{"method": "huggingface_entities", "hf_model_path": "not_a_model"}])
18
18
 
19
19
 
20
20
  def test_huggingface_empty():
21
21
  scrub = IDScrub([" ", "John Smith", ""])
22
- scrubbed = scrub.huggingface_entities()
22
+ scrubbed = scrub.scrub(pipeline=[{"method": "huggingface_entities"}])
23
23
 
24
24
  assert scrubbed == [" ", "[PERSON]", ""]
25
25
  assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
test/test_id.py CHANGED
@@ -2,23 +2,24 @@ from idscrub import IDScrub
2
2
 
3
3
 
4
4
  def test_id_ints():
5
- scrub = IDScrub(texts=["clement_attlee@gmail.com"] * 10, text_ids=range(100, 110), text_id_name="PM")
6
- scrub.email_addresses()
5
+ scrub = IDScrub(texts=["clement_attlee@testemail.com"] * 10, text_ids=range(100, 110), text_id_name="PM")
6
+ scrub.scrub(pipeline=[{"method": "email_addresses"}])
7
7
  assert scrub.get_scrubbed_data()["PM"].min() == 100
8
8
  assert scrub.get_scrubbed_data()["PM"].max() == 109
9
9
  assert scrub.get_scrubbed_data()["PM"].to_list() == [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
10
10
 
11
11
 
12
12
  def test_id_strs():
13
- scrub = IDScrub(texts=["clement_attlee@gmail.com"] * 2, text_ids=["random", "minister"], text_id_name="PM")
14
- scrub.email_addresses()
13
+ scrub = IDScrub(texts=["clement_attlee@testemail.com"] * 2, text_ids=["random", "minister"], text_id_name="PM")
14
+ scrub.scrub(pipeline=[{"method": "email_addresses"}])
15
15
  assert scrub.get_scrubbed_data()["PM"][0] == "random"
16
16
  assert scrub.get_scrubbed_data()["PM"][1] == "minister"
17
17
 
18
18
 
19
19
  def test_multiple():
20
- scrub = IDScrub(texts=["clement_attlee@gmail.com", "SW1A 2AA"] * 10, text_ids=range(100, 120), text_id_name="PM")
21
- scrub.email_addresses()
22
- scrub.uk_postcodes()
20
+ scrub = IDScrub(
21
+ texts=["clement_attlee@testemail.com", "SW1A 2AA"] * 10, text_ids=range(100, 120), text_id_name="PM"
22
+ )
23
+ scrub.scrub(pipeline=[{"method": "email_addresses"}, {"method": "uk_postcodes"}])
23
24
  assert scrub.get_scrubbed_data()["PM"].min() == 100
24
25
  assert scrub.get_scrubbed_data()["PM"].max() == 119
test/test_label.py CHANGED
@@ -1,9 +1,14 @@
1
1
  def test_label(scrub_object_all):
2
- for i, scrub_method in enumerate(
3
- ["uk_postcodes", "email_addresses", "ip_addresses", "uk_phone_numbers", "titles", "handles"]
4
- ):
5
- method = getattr(scrub_object_all, scrub_method)
6
- method(label="test")
2
+ scrub_object_all.scrub(
3
+ pipeline=[
4
+ {"method": "uk_postcodes", "label": "test"},
5
+ {"method": "email_addresses", "label": "test"},
6
+ {"method": "ip_addresses", "label": "test"},
7
+ {"method": "uk_phone_numbers", "label": "test"},
8
+ {"method": "titles", "label": "test"},
9
+ {"method": "handles", "label": "test"},
10
+ ]
11
+ )
7
12
 
8
13
  df = scrub_object_all.get_scrubbed_data()
9
14
 
@@ -11,7 +16,17 @@ def test_label(scrub_object_all):
11
16
 
12
17
 
13
18
  def test_regex_label(scrub_object_all):
14
- scrub_object_all.custom_regex(custom_regex_patterns=[r"number", r"live"], labels=["regex_number", "regex_live"])
19
+ scrub_object_all.scrub(
20
+ pipeline=[
21
+ {
22
+ "method": "custom_regex",
23
+ "patterns": {
24
+ "number": {"pattern": r"number", "replacement": "[REDACTED]", "priority": 0.5},
25
+ "live": {"pattern": r"live", "replacement": "[REDACTED]"},
26
+ },
27
+ }
28
+ ]
29
+ )
15
30
  df = scrub_object_all.get_scrubbed_data()
16
31
 
17
- assert df.columns.to_list() == ["text_id", "regex_number", "regex_live"]
32
+ assert df.columns.to_list() == ["text_id", "number", "live"]
test/test_overlap.py ADDED
@@ -0,0 +1,86 @@
1
+ from idscrub import IDScrub
2
+
3
+
4
+ def test_overlap():
5
+ scrub = IDScrub(texts=["My email is fakeperson@fakeemail.com"])
6
+ scrubbed = scrub.scrub(
7
+ pipeline=[{"method": "handles", "priority": 0.1}, {"method": "email_addresses", "priority": 1.0}]
8
+ )
9
+ assert max([ident.priority for ident in scrub.idents_all]) == 1.0
10
+ assert scrub.idents_all == [
11
+ IDScrub.IDEnt(
12
+ text_id=1,
13
+ text="@fakeemail.com",
14
+ start=22,
15
+ end=36,
16
+ label="handle",
17
+ replacement="[HANDLE]",
18
+ priority=0.1,
19
+ source="regex",
20
+ ),
21
+ IDScrub.IDEnt(
22
+ text_id=1,
23
+ text="fakeperson@fakeemail.com",
24
+ start=12,
25
+ end=36,
26
+ label="email_address",
27
+ replacement="[EMAIL_ADDRESS]",
28
+ priority=1.0,
29
+ source="regex",
30
+ ),
31
+ ]
32
+ assert scrub.idents == [
33
+ IDScrub.IDEnt(
34
+ text_id=1,
35
+ text="fakeperson@fakeemail.com",
36
+ start=12,
37
+ end=36,
38
+ label="email_address",
39
+ replacement="[EMAIL_ADDRESS]",
40
+ priority=1.0,
41
+ source="regex",
42
+ )
43
+ ]
44
+ assert scrubbed == ["My email is [EMAIL_ADDRESS]"]
45
+
46
+
47
+ def test_overlap_default():
48
+ scrub = IDScrub(texts=["I am @John Smith"])
49
+ scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities", "entity_types": ["PERSON"]}, {"method": "handles"}])
50
+ assert max([ident.priority for ident in scrub.idents_all]) == 1.0
51
+ assert scrub.idents_all == [
52
+ IDScrub.IDEnt(
53
+ text_id=1,
54
+ text="@John Smith",
55
+ start=5,
56
+ end=16,
57
+ label="person",
58
+ replacement="[PERSON]",
59
+ priority=1.0,
60
+ source="spacy",
61
+ ),
62
+ IDScrub.IDEnt(
63
+ text_id=1,
64
+ text="@John",
65
+ start=5,
66
+ end=10,
67
+ label="handle",
68
+ replacement="[HANDLE]",
69
+ priority=0.4,
70
+ source="regex",
71
+ ),
72
+ ]
73
+ assert scrub.idents == [
74
+ IDScrub.IDEnt(
75
+ text_id=1,
76
+ text="@John Smith",
77
+ start=5,
78
+ end=16,
79
+ label="person",
80
+ replacement="[PERSON]",
81
+ priority=1.0,
82
+ source="spacy",
83
+ )
84
+ ]
85
+
86
+ assert scrubbed == ["I am [PERSON]"]
test/test_phonenumbers.py CHANGED
@@ -3,11 +3,11 @@ from idscrub import IDScrub
3
3
 
4
4
  def test_google_phone_numbers_gb():
5
5
  scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01475 123456."])
6
- scrubbed = scrub.google_phone_numbers(region="GB")
6
+ scrubbed = scrub.scrub(pipeline=[{"method": "google_phone_numbers"}])
7
7
  assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
8
8
 
9
9
 
10
10
  def test_google_phone_numbers_us():
11
11
  scrub = IDScrub(texts=["My US phone number is +1-718-222-2222! My old phone number is 12124567890."])
12
- scrubbed = scrub.google_phone_numbers(region="US")
12
+ scrubbed = scrub.scrub(pipeline=[{"method": "google_phone_numbers", "region": "US"}])
13
13
  assert scrubbed == ["My US phone number is [PHONENO]! My old phone number is [PHONENO]."]
test/test_presidio.py CHANGED
@@ -8,20 +8,27 @@ def test_presidio():
8
8
  scrub = IDScrub(
9
9
  ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
10
10
  )
11
- scrubbed_texts = scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
11
+ scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities"}])
12
12
 
13
- assert scrubbed_texts == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
13
+ assert scrubbed == ["Our names are [PERSON], [PERSON], and [PERSON].", "My IBAN code is [IBAN_CODE]."]
14
14
 
15
15
 
16
16
  def test_presidio_map():
17
17
  scrub = IDScrub(
18
18
  ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
19
19
  )
20
- scrubbed_texts = scrub.presidio_entities(
21
- entities=["PERSON", "IBAN_CODE"], replacement_map={"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"}
20
+
21
+ scrubbed = scrub.scrub(
22
+ pipeline=[
23
+ {
24
+ "method": "presidio_entities",
25
+ "entity_types": ["PERSON", "IBAN_CODE"],
26
+ "replacement_map": {"PERSON": "[PHELLO]", "IBAN_CODE": "[IHELLO]"},
27
+ }
28
+ ]
22
29
  )
23
30
 
24
- assert scrubbed_texts == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
31
+ assert scrubbed == ["Our names are [PHELLO], [PHELLO], and [PHELLO].", "My IBAN code is [IHELLO]."]
25
32
 
26
33
 
27
34
  def test_presidio_get_data():
@@ -29,7 +36,7 @@ def test_presidio_get_data():
29
36
  ["Our names are Hamish McDonald, L. Salah, and Elena Suárez.", "My IBAN code is GB91BKEN10000041610008."]
30
37
  )
31
38
 
32
- scrub.presidio_entities(entities=["PERSON", "IBAN_CODE"])
39
+ scrub.scrub(pipeline=[{"method": "presidio_entities"}])
33
40
 
34
41
  df = scrub.get_scrubbed_data()
35
42
 
test/test_regex.py CHANGED
@@ -1,25 +1,25 @@
1
- import re
2
-
3
1
  from idscrub import IDScrub
4
2
 
5
3
 
6
4
  def test_email_addresses():
7
5
  scrub = IDScrub(
8
- texts=["Send me an email at jim@gmail.com or at marie-9999@randomemail.co.uk or at hello_world@john-smith.com."]
6
+ texts=[
7
+ "Send me an email at jim@testemail.com or at marie-9999@randomemail.co.uk or at hello_world@john-smith.com."
8
+ ]
9
9
  )
10
- scrubbed = scrub.email_addresses()
10
+ scrubbed = scrub.scrub(pipeline=[{"method": "email_addresses"}])
11
11
  assert scrubbed == ["Send me an email at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS] or at [EMAIL_ADDRESS]."]
12
12
 
13
13
 
14
14
  def test_ip_addresses():
15
15
  scrub = IDScrub(texts=["This has been sent to 8.8.8.8 and requested by 192.0.2.1."])
16
- scrubbed = scrub.ip_addresses()
16
+ scrubbed = scrub.scrub(pipeline=[{"method": "ip_addresses"}])
17
17
  assert scrubbed == ["This has been sent to [IPADDRESS] and requested by [IPADDRESS]."]
18
18
 
19
19
 
20
20
  def test_uk_postcodes():
21
21
  scrub = IDScrub(texts=["I live at A11 1AA. My friend lives at KA308JB. The Prime Minister lives at SW1A 2AA."])
22
- scrubbed = scrub.uk_postcodes()
22
+ scrubbed = scrub.scrub(pipeline=[{"method": "uk_postcodes"}])
23
23
  assert scrubbed == ["I live at [POSTCODE]. My friend lives at [POSTCODE]. The Prime Minister lives at [POSTCODE]."]
24
24
 
25
25
 
@@ -30,7 +30,7 @@ def test_titles_not_strict():
30
30
  "I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
31
31
  ]
32
32
  )
33
- scrubbed = scrub.titles()
33
+ scrubbed = scrub.scrub(pipeline=[{"method": "titles"}])
34
34
  assert scrubbed == [
35
35
  "Hello [TITLE]. Smith! I am [TITLE] Patel",
36
36
  "I am here on behalf of [TITLE] Austen, General Eisenhower, and [TITLE] Jack Sparrow.",
@@ -44,7 +44,7 @@ def test_titles_strict():
44
44
  "I am here on behalf of Ms Austen, General Eisenhower, and Captain Jack Sparrow.",
45
45
  ]
46
46
  )
47
- scrubbed = scrub.titles(strict=True)
47
+ scrubbed = scrub.scrub(pipeline=[{"method": "titles", "strict": True}])
48
48
  assert scrubbed == [
49
49
  "Hello [TITLE]. Smith! I am [TITLE] Patel",
50
50
  "I am here on behalf of [TITLE] Austen, [TITLE] Eisenhower, and [TITLE] Jack Sparrow.",
@@ -53,32 +53,52 @@ def test_titles_strict():
53
53
 
54
54
  def test_uk_phone_numbers():
55
55
  scrub = IDScrub(texts=["My phone number is +441234567891! My old phone number is 01111 123456."])
56
- scrubbed = scrub.uk_phone_numbers()
56
+ scrubbed = scrub.scrub(pipeline=[{"method": "uk_phone_numbers"}])
57
57
  assert scrubbed == ["My phone number is [PHONENO]! My old phone number is [PHONENO]."]
58
58
 
59
59
 
60
60
  def test_handles():
61
61
  scrub = IDScrub(texts=["Our usernames are @HenrikLarsson, @Jimmy_Johnstone, @Nakamura-67 and @Aidan_McGeady_46."])
62
- scrubbed = scrub.handles()
62
+ scrubbed = scrub.scrub(pipeline=[{"method": "handles"}])
63
63
  assert scrubbed == ["Our usernames are [HANDLE], [HANDLE], [HANDLE] and [HANDLE]."]
64
64
 
65
65
 
66
+ def test_urls():
67
+ scrub = IDScrub(
68
+ [
69
+ "www.example.co.uk",
70
+ "https://example.com",
71
+ "http://sub.domain.co.uk/path?query=1&x=2",
72
+ "www.example.org/page/index.html",
73
+ "https://example.com:8080/path/to/resource#anchor",
74
+ "www.test-site123.net/some/path?with=paramsexample.comexample.co.uk/home",
75
+ ]
76
+ )
77
+
78
+ scrubbed = scrub.scrub(pipeline=[{"method": "urls"}])
79
+
80
+ assert scrubbed == ["[URL]", "[URL]", "[URL]", "[URL]", "[URL]", "[URL]"]
81
+
82
+
66
83
  def test_uk_addresses():
67
84
  scrub = IDScrub(
68
85
  [
69
86
  "221B Baker Street",
70
87
  "12 high road",
71
88
  "Flat 3B, 47 King's Court",
72
- "1214 High Street",
89
+ "12-14 High Street",
73
90
  "5a-7a Church Lane",
74
91
  "1/2 Main Street",
75
92
  "10 St John’s Rd",
76
93
  "33 Queen-Anne Walk",
77
94
  "8 Deansgate Ct",
78
- ]
95
+ "10 Downing Street",
96
+ "10, Downing Street",
97
+ ],
79
98
  )
80
99
 
81
- scrubbed = scrub.uk_addresses()
100
+ scrubbed = scrub.scrub(pipeline=[{"method": "uk_addresses"}])
101
+
82
102
  assert scrubbed == [
83
103
  "[ADDRESS]",
84
104
  "[ADDRESS]",
@@ -89,6 +109,8 @@ def test_uk_addresses():
89
109
  "[ADDRESS]",
90
110
  "[ADDRESS]",
91
111
  "[ADDRESS]",
112
+ "[ADDRESS]",
113
+ "[ADDRESS]",
92
114
  ]
93
115
 
94
116
  negative_tests = [
@@ -105,65 +127,94 @@ def test_uk_addresses():
105
127
 
106
128
  scrub = IDScrub(negative_tests)
107
129
 
108
- scrubbed = scrub.uk_addresses()
130
+ scrubbed = scrub.scrub(pipeline=[{"method": "uk_addresses"}])
109
131
  assert scrubbed == negative_tests
110
132
 
111
133
 
112
- def test_claimants():
134
+ def test_custom_regex():
135
+ scrub = IDScrub(texts=[])
136
+
137
+ scrubbed_idents = scrub.custom_regex(
138
+ texts=["It was the best of times, it was the worst of times"],
139
+ text_ids=["A"],
140
+ patterns={
141
+ "times": {"pattern": r"times", "replacement": "[DICKENS]", "priority": 0.5},
142
+ "worst": {"pattern": r"worst", "replacement": "[WORST]", "priority": 0.8},
143
+ },
144
+ )
145
+
146
+ assert scrubbed_idents == [
147
+ IDScrub.IDEnt(
148
+ text_id="A",
149
+ text="times",
150
+ start=19,
151
+ end=24,
152
+ label="times",
153
+ replacement="[DICKENS]",
154
+ priority=0.5,
155
+ source="custom_regex",
156
+ ),
157
+ IDScrub.IDEnt(
158
+ text_id="A",
159
+ text="times",
160
+ start=46,
161
+ end=51,
162
+ label="times",
163
+ replacement="[DICKENS]",
164
+ priority=0.5,
165
+ source="custom_regex",
166
+ ),
167
+ IDScrub.IDEnt(
168
+ text_id="A",
169
+ text="worst",
170
+ start=37,
171
+ end=42,
172
+ label="worst",
173
+ replacement="[WORST]",
174
+ priority=0.8,
175
+ source="custom_regex",
176
+ ),
177
+ ]
178
+
113
179
  scrub = IDScrub(
114
180
  texts=[
115
- "This is legal text. Claimant: John Smith Respondents: Jill Hill.",
116
- "Claimant: J Smith Respondents: Jill Hill. J Smith is the respondent.",
181
+ "It was the best of times, it was the worst of times",
117
182
  ]
118
183
  )
119
- scrubbed = scrub.claimants()
120
- assert scrubbed == [
121
- "This is legal text. Claimant: [CLAIMANT] Respondents: Jill Hill.",
122
- "Claimant: [CLAIMANT] Respondents: Jill Hill. [CLAIMANT] is the respondent.",
123
- ]
124
-
125
-
126
- def test_custom_regex():
127
- scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
128
- scrubbed = scrub.custom_regex(custom_regex_patterns=[r"times"])
129
- assert scrubbed == ["It was the best of [REDACTED], it was the worst of [REDACTED]"]
130
184
 
131
- scrub = IDScrub(texts=["It was the best of times, it was the worst of times"])
132
- scrubbed = scrub.custom_regex(
133
- custom_regex_patterns=[r"times", "worst"], custom_replacement_texts=["[DICKENS]", "[WORST]"]
185
+ scrubbed_text = scrub.scrub(
186
+ pipeline=[
187
+ {
188
+ "method": "custom_regex",
189
+ "patterns": {
190
+ "times": {"pattern": r"times", "replacement": "[DICKENS]", "priority": 0.5},
191
+ "worst": {"pattern": r"worst", "replacement": "[WORST]", "priority": 0.5},
192
+ },
193
+ }
194
+ ]
134
195
  )
135
- assert scrubbed == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
136
-
137
-
138
- def test_scrub_and_collect():
139
- scrub = IDScrub()
140
- text = "Hello Muhammad and Jack."
141
- pattern = r"\bMuhammad|Jack\b"
142
- replacement = "[NAME]"
143
- label = "custom_regex"
144
- i = 1
145
-
146
- def replacer(match):
147
- return scrub.scrub_and_collect(match, text, replacement, i, label)
148
196
 
149
- scrubbed = re.sub(pattern, replacer, text)
150
-
151
- assert scrubbed == "Hello [NAME] and [NAME]."
152
- assert scrub.scrubbed_data == [
153
- {"text_id": 1, "custom_regex": "Muhammad"},
154
- {"text_id": 1, "custom_regex": "Jack"},
155
- ]
197
+ assert scrubbed_text == ["It was the best of [DICKENS], it was the [WORST] of [DICKENS]"]
156
198
 
157
199
 
158
200
  def test_remove_regex():
159
- scrub = IDScrub(texts=["Hi! My name is Clement Atlee!", "I am Harold Wilson."])
201
+ texts = ["Hi! My name is Clement Atlee!"]
202
+ text_ids = ["UK"]
203
+ scrub = IDScrub([])
160
204
  label = "regex_names"
161
205
  pattern = r"Clement Atlee|Harold Wilson"
162
- replacement_text = "[PM]"
163
- scrubbed = scrub.scrub_regex(pattern, replacement_text, label)
206
+ replacement = "[PM]"
207
+ priority = 0.5
208
+ idents = scrub.find_regex(
209
+ texts=texts, text_ids=text_ids, pattern=pattern, replacement=replacement, label=label, priority=priority
210
+ )
164
211
 
165
- assert scrubbed == ["Hi! My name is [PM]!", "I am [PM]."]
166
- assert scrub.scrubbed_data == [
167
- {"text_id": 1, "regex_names": "Clement Atlee"},
168
- {"text_id": 2, "regex_names": "Harold Wilson"},
169
- ]
212
+ assert len(idents) == 1
213
+ assert idents[0].text_id == "UK"
214
+ assert idents[0].text == "Clement Atlee"
215
+ assert idents[0].start == 15
216
+ assert idents[0].end == 28
217
+ assert idents[0].label == "regex_names"
218
+ assert idents[0].replacement == "[PM]"
219
+ assert idents[0].priority == 0.5
220
+ assert idents[0].source == "regex"