idscrub 2.0.0__tar.gz → 2.0.1__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- {idscrub-2.0.0 → idscrub-2.0.1}/PKG-INFO +1 -1
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub/scrub.py +4 -2
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/PKG-INFO +1 -1
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_presidio.py +8 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_spacy.py +2 -2
- {idscrub-2.0.0 → idscrub-2.0.1}/.github/pull_request_template.md +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/.github/workflows/cd.yml +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/.github/workflows/ci.yml +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/.gitignore +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/.pre-commit-config.yaml +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/CODEOWNERS +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/LICENSE +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/Makefile +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/README.md +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/SECURITY_CHECKLIST.md +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub/__init__.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub/locations.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/SOURCES.txt +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/dependency_links.txt +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/requires.txt +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/idscrub.egg-info/top_level.txt +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/notebooks/basic_usage.ipynb +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/pyproject.toml +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/setup.cfg +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/conftest.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_dataframe.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_errors.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_exclude.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_group.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_huggingface.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_id.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_label.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_overlap.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_phonenumbers.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_regex.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_scrub.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/test/test_scrub_text.py +0 -0
- {idscrub-2.0.0 → idscrub-2.0.1}/uv.lock +0 -0
|
@@ -634,7 +634,7 @@ class IDScrub:
|
|
|
634
634
|
"""
|
|
635
635
|
|
|
636
636
|
nlp = self.get_spacy_model(model_name)
|
|
637
|
-
stripped_texts = [
|
|
637
|
+
stripped_texts = ["" if s.strip() == "" else s for s in texts]
|
|
638
638
|
docs = nlp.pipe(stripped_texts, n_process=n_process, batch_size=batch_size)
|
|
639
639
|
|
|
640
640
|
idents = []
|
|
@@ -825,9 +825,11 @@ class IDScrub:
|
|
|
825
825
|
|
|
826
826
|
analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine)
|
|
827
827
|
|
|
828
|
+
stripped_texts = ["" if s.strip() == "" else s for s in texts]
|
|
829
|
+
|
|
828
830
|
idents = []
|
|
829
831
|
|
|
830
|
-
for text, text_id in zip(
|
|
832
|
+
for text, text_id in zip(stripped_texts, text_ids):
|
|
831
833
|
results = analyzer.analyze(text=text, language="en", entities=entity_types)
|
|
832
834
|
for res in results:
|
|
833
835
|
if res.entity_type not in entity_types:
|
|
@@ -49,3 +49,11 @@ def test_presidio_get_data():
|
|
|
49
49
|
)
|
|
50
50
|
|
|
51
51
|
assert_frame_equal(df, expected_df)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_presidio_empty():
|
|
55
|
+
scrub = IDScrub([" ", " John Smith", ""])
|
|
56
|
+
scrubbed = scrub.scrub(pipeline=[{"method": "presidio_entities", "entity_types": ["PERSON"]}])
|
|
57
|
+
|
|
58
|
+
assert scrubbed == [" ", " [PERSON]", ""]
|
|
59
|
+
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
|
@@ -18,10 +18,10 @@ def test_spacy_error():
|
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
def test_spacy_empty():
|
|
21
|
-
scrub = IDScrub([" ", "John Smith", ""])
|
|
21
|
+
scrub = IDScrub([" ", " John Smith", ""])
|
|
22
22
|
scrubbed = scrub.scrub(pipeline=[{"method": "spacy_entities"}])
|
|
23
23
|
|
|
24
|
-
assert scrubbed == [" ", "[PERSON]", ""]
|
|
24
|
+
assert scrubbed == [" ", " [PERSON]", ""]
|
|
25
25
|
assert_frame_equal(scrub.get_scrubbed_data(), pd.DataFrame({"text_id": 2, "person": [["John Smith"]]}))
|
|
26
26
|
|
|
27
27
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|