idscrub 2.0.0__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {idscrub-2.0.0 → idscrub-2.0.1}/PKG-INFO +1 -1
  2. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub/scrub.py +4 -2
  3. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/PKG-INFO +1 -1
  4. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_presidio.py +8 -0
  5. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_spacy.py +2 -2
  6. {idscrub-2.0.0 → idscrub-2.0.1}/.github/pull_request_template.md +0 -0
  7. {idscrub-2.0.0 → idscrub-2.0.1}/.github/workflows/cd.yml +0 -0
  8. {idscrub-2.0.0 → idscrub-2.0.1}/.github/workflows/ci.yml +0 -0
  9. {idscrub-2.0.0 → idscrub-2.0.1}/.gitignore +0 -0
  10. {idscrub-2.0.0 → idscrub-2.0.1}/.pre-commit-config.yaml +0 -0
  11. {idscrub-2.0.0 → idscrub-2.0.1}/CODEOWNERS +0 -0
  12. {idscrub-2.0.0 → idscrub-2.0.1}/LICENSE +0 -0
  13. {idscrub-2.0.0 → idscrub-2.0.1}/Makefile +0 -0
  14. {idscrub-2.0.0 → idscrub-2.0.1}/README.md +0 -0
  15. {idscrub-2.0.0 → idscrub-2.0.1}/SECURITY_CHECKLIST.md +0 -0
  16. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub/__init__.py +0 -0
  17. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub/locations.py +0 -0
  18. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/SOURCES.txt +0 -0
  19. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/dependency_links.txt +0 -0
  20. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/requires.txt +0 -0
  21. {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/top_level.txt +0 -0
  22. {idscrub-2.0.0 → idscrub-2.0.1}/notebooks/basic_usage.ipynb +0 -0
  23. {idscrub-2.0.0 → idscrub-2.0.1}/pyproject.toml +0 -0
  24. {idscrub-2.0.0 → idscrub-2.0.1}/setup.cfg +0 -0
  25. {idscrub-2.0.0 → idscrub-2.0.1}/test/conftest.py +0 -0
  26. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_dataframe.py +0 -0
  27. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_errors.py +0 -0
  28. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_exclude.py +0 -0
  29. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_group.py +0 -0
  30. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_huggingface.py +0 -0
  31. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_id.py +0 -0
  32. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_label.py +0 -0
  33. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_overlap.py +0 -0
  34. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_phonenumbers.py +0 -0
  35. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_regex.py +0 -0
  36. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_scrub.py +0 -0
  37. {idscrub-2.0.0 → idscrub-2.0.1}/test/test_scrub_text.py +0 -0
  38. {idscrub-2.0.0 → idscrub-2.0.1}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Author: Department for Business and Trade
5
5
  Classifier: Development Status :: 3 - Alpha
6
6
  Requires-Python: >=3.12
@@ -634,7 +634,7 @@ class IDScrub:
634
634
  """
635
635
 
636
636
  nlp = self.get_spacy_model(model_name)
637
- stripped_texts = [s.strip() if s.isspace() else s for s in texts]
637
+ stripped_texts = ["" if s.strip() == "" else s for s in texts]
638
638
  docs = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
639
639
 
640
640
  idents = []
@@ -825,9 +825,11 @@ class IDScrub:
825
825
 
826
826
  analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine)
827
827
 
828
+ stripped_texts = ["" if s.strip() == "" else s for s in texts]
829
+
828
830
  idents = []
829
831
 
830
- for text, text_id in zip(texts, text_ids):
832
+ for text, text_id in zip(stripped_texts, text_ids):
831
833
  results = analyzer.analyze(text=text, language="en", entities=entity_types)
832
834
  for res in results:
833
835
  if res.entity_type not in entity_types:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: idscrub
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Author: Department for Business and Trade
5
5
  Classifier: Development Status :: 3 - Alpha
6
6
  Requires-Python: >=3.12
@@ -49,3 +49,11 @@ def test_presidio_get_data():
49
49
  )
50
50
 
51
51
  assert_frame_equal(df, expected_df)
52
+
53
+
54
+ def test_presidio_empty():
55
+ scrub = IDScrub([" ", " John Smith", ""])
56
+ scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities", "entity_types": ["PERSON"]}])
57
+
58
+ assert scrubbed == [" ", " [PERSON]", ""]
59
+ assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
@@ -18,10 +18,10 @@ def test_spacy_error():
18
18
 
19
19
 
20
20
  def test_spacy_empty():
21
- scrub = IDScrub([" ", "John Smith", ""])
21
+ scrub = IDScrub([" ", " John Smith", ""])
22
22
  scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities"}])
23
23
 
24
- assert scrubbed == [" ", "[PERSON]", ""]
24
+ assert scrubbed == [" ", " [PERSON]", ""]
25
25
  assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
26
26
 
27
27
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes