idscrub 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
idscrub/scrub.py CHANGED
@@ -634,7 +634,7 @@ class IDScrub:
634
634
  """
635
635
 
636
636
  nlp = self.get_spacy_model(model_name)
637
- stripped_texts = [s.strip() if s.isspace() else s for s in texts]
637
+ stripped_texts = ["" if s.strip() == "" else s for s in texts]
638
638
  docs = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
639
639
 
640
640
  idents = []
@@ -825,9 +825,11 @@ class IDScrub:
825
825
 
826
826
  analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine)
827
827
 
828
+ stripped_texts = ["" if s.strip() == "" else s for s in texts]
829
+
828
830
  idents = []
829
831
 
830
- for text, text_id in zip(texts, text_ids):
832
+ for text, text_id in zip(stripped_texts, text_ids):
831
833
  results = analyzer.analyze(text=text, language="en", entities=entity_types)
832
834
  for res in results:
833
835
  if res.entity_type not in entity_types:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Author: Department for Business and Trade
5
5
  Classifier: Development Status :: 3 - Alpha
6
6
  Requires-Python: >=3.12
@@ -1,7 +1,7 @@
1
1
  idscrub/__init__.py,sha256=cRugJv27q1q--bl-VNLpfiScJb_ROlUxyLFhaF55S1w,38
2
2
  idscrub/locations.py,sha256=7fMNOcGMYe7sX8TrfhMW6oYGAlc1WVYVQKQbpxE3pqo,217
3
- idscrub/scrub.py,sha256=ow5gDejdchXxi3iN1vEqlCaJGGkaEnvFJ8nAau6w9XI,45308
4
- idscrub-2.0.0.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
3
+ idscrub/scrub.py,sha256=38r5pbsXkl3s_XLlstRbUm4e59zrN_CUwY9WtMnlcC0,45386
4
+ idscrub-2.0.1.dist-info/licenses/LICENSE,sha256=JJnuf10NSx7YXglte1oH_N9ZP3AcWR_Y8irvQb_wnsg,1090
5
5
  notebooks/basic_usage.ipynb,sha256=kfCgdN8DnjIUd9wZs93mpLKMT5NA3E8ty8a6mQUAZGo,40195
6
6
  test/conftest.py,sha256=CD_fYlo-qjkDgsW-ZRMZjW5Famqpt8fzN2HOcJ2MBDU,1450
7
7
  test/test_dataframe.py,sha256=MWHQVJ_lCKLR86lnGGz9Nlke-pPQwvim11Cs5INxmh8,6395
@@ -13,12 +13,12 @@ test/test_id.py,sha256=z2Za6sk-Km-r2FN6i03_q2IY5I21OkwfOtc4p9sRvJg,1108
13
13
  test/test_label.py,sha256=uiUF72nZFt5GWE7HWO2ura3XU4hSKfe6ZTOyc7oRyzg,1053
14
14
  test/test_overlap.py,sha256=kM6jePx94evk-VRkMzeH_UiuYPR7FFe63oBgLsvfvZw,2380
15
15
  test/test_phonenumbers.py,sha256=LqAUzSYdYoQQyO_mAHYQkkRCgh4uArX4oxxKVhMi548,661
16
- test/test_presidio.py,sha256=Y6kgPOZ_ie3shVThbNfJ2i7w1yifS5ukI4eAo1Lgm6k,1658
16
+ test/test_presidio.py,sha256=qm5zrqjK4Qdg0Ha7Oo63s_BvMmZvC3lgixlHTAZV0G4,1989
17
17
  test/test_regex.py,sha256=Z_D8ttPYnRCr4gl13W9IEb7vS6QtWMReAc9Rufbtv50,6789
18
18
  test/test_scrub.py,sha256=aelzLpF2S_5F754VcZtWeEzQIcUZ0ex0m6A8mTAnWQg,1591
19
19
  test/test_scrub_text.py,sha256=WadP66U3jrSqXlbUrIrguDYSfhRKFJW73lR6eudC22o,605
20
- test/test_spacy.py,sha256=5wIPnHirs6fx1Msz9IDDF8dxyh3t1asWaFEvSXeiBvU,1910
21
- idscrub-2.0.0.dist-info/METADATA,sha256=ARod5-Azd--n-lWnZ3yHF4jt9MSjpPrcnMePzmpp8BM,8537
22
- idscrub-2.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
23
- idscrub-2.0.0.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
24
- idscrub-2.0.0.dist-info/RECORD,,
20
+ test/test_spacy.py,sha256=sf3dtpdZibUVc3BG5v7gGaADusNDek_loCDLj-_mQTc,1914
21
+ idscrub-2.0.1.dist-info/METADATA,sha256=WYIHhgM7Ln7B62i0-A-0MOEcMhrqanorgTHMf1jAms8,8537
22
+ idscrub-2.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
23
+ idscrub-2.0.1.dist-info/top_level.txt,sha256=D4EEodXGCjGiX35ObiBTmjjBAdouN-eCvH-LezGGtks,23
24
+ idscrub-2.0.1.dist-info/RECORD,,
test/test_presidio.py CHANGED
@@ -49,3 +49,11 @@ def test_presidio_get_data():
49
49
  )
50
50
 
51
51
  assert_frame_equal(df, expected_df)
52
+
53
+
54
+ def test_presidio_empty():
55
+ scrub = IDScrub([" ", " John Smith", ""])
56
+ scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities", "entity_types": ["PERSON"]}])
57
+
58
+ assert scrubbed == [" ", " [PERSON]", ""]
59
+ assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
test/test_spacy.py CHANGED
@@ -18,10 +18,10 @@ def test_spacy_error():
18
18
 
19
19
 
20
20
  def test_spacy_empty():
21
- scrub = IDScrub([" ", "John Smith", ""])
21
+ scrub = IDScrub([" ", " John Smith", ""])
22
22
  scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities"}])
23
23
 
24
- assert scrubbed == [" ", "[PERSON]", ""]
24
+ assert scrubbed == [" ", " [PERSON]", ""]
25
25
  assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
26
26
 
27
27