pyannotators-patterns 0.5.84__tar.gz → 0.5.88__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/PKG-INFO +2 -1
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/pyproject.toml +1 -1
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/setup.py +2 -1
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/__init__.py +1 -1
- pyannotators_patterns-0.5.88/src/pyannotators_patterns/named_pattern_recognizer.py +114 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/patterns.py +9 -4
- pyannotators_patterns-0.5.88/tests/data/coords-document.json +8 -0
- pyannotators_patterns-0.5.88/tests/data/coords.json +15 -0
- pyannotators_patterns-0.5.88/tests/data/tel-document.json +8 -0
- pyannotators_patterns-0.5.88/tests/data/tel.json +21 -0
- pyannotators_patterns-0.5.88/tests/test_coords.py +37 -0
- pyannotators_patterns-0.5.88/tests/test_tel.py +32 -0
- pyannotators_patterns-0.5.84/hgnc_cache.sqlite +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.bumpversion.cfg +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.github/workflows/main.yml +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.gitignore +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.pre-commit-config.yaml +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.readthedocs.yml +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/AUTHORS.md +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/Dockerfile +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/Jenkinsfile +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/LICENSE +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/README.md +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/RELEASE.md +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/bumpversion.py +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/.gitignore +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/LICENSE +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/_static/.gitkeep +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/_templates/.gitkeep +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/conf.py +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/index.rst +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/mypy.ini +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/assertions.py +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/test_credit_cards.py +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/test_emails.py +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/test_zip.py +0 -0
- {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tox.ini +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pyannotators-patterns
|
|
3
|
-
Version: 0.5.84
|
|
3
|
+
Version: 0.5.88
|
|
4
4
|
Summary: Annotator based on Presidio pattern recognizer
|
|
5
5
|
Home-page: https://github.com/oterrier/pyannotators_patterns/
|
|
6
6
|
Keywords:
|
|
@@ -37,6 +37,7 @@ Requires-Dist: pytest ; extra == "test"
|
|
|
37
37
|
Requires-Dist: pytest-cov ; extra == "test"
|
|
38
38
|
Requires-Dist: pytest-flake8 ; extra == "test"
|
|
39
39
|
Requires-Dist: pytest-black ; extra == "test"
|
|
40
|
+
Requires-Dist: pytest_check ; extra == "test"
|
|
40
41
|
Requires-Dist: flake8==3.9.2 ; extra == "test"
|
|
41
42
|
Requires-Dist: tox ; extra == "test"
|
|
42
43
|
Requires-Dist: dirty-equals ; extra == "test"
|
|
@@ -31,6 +31,7 @@ extras_require = \
|
|
|
31
31
|
'pytest-cov',
|
|
32
32
|
'pytest-flake8',
|
|
33
33
|
'pytest-black',
|
|
34
|
+
'pytest_check',
|
|
34
35
|
'flake8==3.9.2',
|
|
35
36
|
'tox',
|
|
36
37
|
'dirty-equals']}
|
|
@@ -40,7 +41,7 @@ entry_points = \
|
|
|
40
41
|
'pyannotators_patterns.patterns:PatternsAnnotator']}
|
|
41
42
|
|
|
42
43
|
setup(name='pyannotators-patterns',
|
|
43
|
-
version='0.5.84',
|
|
44
|
+
version='0.5.88',
|
|
44
45
|
description='Annotator based on Presidio pattern recognizer',
|
|
45
46
|
author='Olivier Terrier',
|
|
46
47
|
author_email='olivier.terrier@kairntech.com',
|
{pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/__init__.py
RENAMED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""Annotator based on Presidio pattern recognizer"""
|
|
2
|
-
__version__ = "0.5.84"
|
|
2
|
+
__version__ = "0.5.88"
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from presidio_analyzer import Pattern, PatternRecognizer, RecognizerResult, EntityRecognizer
|
|
5
|
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NamedPatternRecognizer(PatternRecognizer):
|
|
9
|
+
def __init__(
|
|
10
|
+
self,
|
|
11
|
+
supported_entity: str,
|
|
12
|
+
name: str = None,
|
|
13
|
+
supported_language: str = "en",
|
|
14
|
+
patterns: List[Pattern] = None,
|
|
15
|
+
deny_list: List[str] = None,
|
|
16
|
+
context: List[str] = None,
|
|
17
|
+
deny_list_score: float = 1.0,
|
|
18
|
+
global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
|
|
19
|
+
version: str = "0.0.1",
|
|
20
|
+
):
|
|
21
|
+
super().__init__(supported_entity, name, supported_language, patterns, deny_list, context, deny_list_score,
|
|
22
|
+
global_regex_flags, version)
|
|
23
|
+
|
|
24
|
+
def analyze(
|
|
25
|
+
self,
|
|
26
|
+
text: str,
|
|
27
|
+
entities: List[str],
|
|
28
|
+
nlp_artifacts: Optional[NlpArtifacts] = None,
|
|
29
|
+
regex_flags: Optional[int] = None,
|
|
30
|
+
) -> List[RecognizerResult]:
|
|
31
|
+
"""
|
|
32
|
+
Analyzes text to detect PII using regular expressions or deny-lists.
|
|
33
|
+
|
|
34
|
+
:param text: Text to be analyzed
|
|
35
|
+
:param entities: Entities this recognizer can detect
|
|
36
|
+
:param nlp_artifacts: Output values from the NLP engine
|
|
37
|
+
:param regex_flags: regex flags to be used in regex matching
|
|
38
|
+
:return:
|
|
39
|
+
"""
|
|
40
|
+
results = []
|
|
41
|
+
|
|
42
|
+
if self.patterns:
|
|
43
|
+
pattern_result = self.__analyze_patterns(text, regex_flags)
|
|
44
|
+
results.extend(pattern_result)
|
|
45
|
+
|
|
46
|
+
return results
|
|
47
|
+
|
|
48
|
+
def __analyze_patterns(
|
|
49
|
+
self, text: str, flags: int = None
|
|
50
|
+
) -> List[RecognizerResult]:
|
|
51
|
+
"""
|
|
52
|
+
Evaluate all patterns in the provided text.
|
|
53
|
+
|
|
54
|
+
Including words in the provided deny-list
|
|
55
|
+
|
|
56
|
+
:param text: text to analyze
|
|
57
|
+
:param flags: regex flags
|
|
58
|
+
:return: A list of RecognizerResult
|
|
59
|
+
"""
|
|
60
|
+
flags = flags if flags else self.global_regex_flags
|
|
61
|
+
results = []
|
|
62
|
+
for pattern in self.patterns:
|
|
63
|
+
# Compile regex if flags differ from flags the regex was compiled with
|
|
64
|
+
if not pattern.compiled_regex or pattern.compiled_with_flags != flags:
|
|
65
|
+
pattern.compiled_with_flags = flags
|
|
66
|
+
pattern.compiled_regex = re.compile(pattern.regex, flags=flags)
|
|
67
|
+
|
|
68
|
+
matches = pattern.compiled_regex.finditer(text)
|
|
69
|
+
for match in matches:
|
|
70
|
+
start, end = match.span()
|
|
71
|
+
current_match = text[start:end]
|
|
72
|
+
|
|
73
|
+
# Skip empty results
|
|
74
|
+
if current_match == "":
|
|
75
|
+
continue
|
|
76
|
+
|
|
77
|
+
score = pattern.score
|
|
78
|
+
|
|
79
|
+
validation_result = self.validate_result(current_match)
|
|
80
|
+
description = self.build_regex_explanation(
|
|
81
|
+
self.name,
|
|
82
|
+
pattern.name,
|
|
83
|
+
pattern.regex,
|
|
84
|
+
score,
|
|
85
|
+
validation_result,
|
|
86
|
+
flags,
|
|
87
|
+
)
|
|
88
|
+
pattern_result = RecognizerResult(
|
|
89
|
+
entity_type=self.supported_entities[0],
|
|
90
|
+
start=start,
|
|
91
|
+
end=end,
|
|
92
|
+
score=score,
|
|
93
|
+
analysis_explanation=description,
|
|
94
|
+
recognition_metadata=match.groupdict()
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
if validation_result is not None:
|
|
98
|
+
if validation_result:
|
|
99
|
+
pattern_result.score = EntityRecognizer.MAX_SCORE
|
|
100
|
+
else:
|
|
101
|
+
pattern_result.score = EntityRecognizer.MIN_SCORE
|
|
102
|
+
|
|
103
|
+
invalidation_result = self.invalidate_result(current_match)
|
|
104
|
+
if invalidation_result is not None and invalidation_result:
|
|
105
|
+
pattern_result.score = EntityRecognizer.MIN_SCORE
|
|
106
|
+
|
|
107
|
+
if pattern_result.score > EntityRecognizer.MIN_SCORE:
|
|
108
|
+
results.append(pattern_result)
|
|
109
|
+
|
|
110
|
+
# Update analysis explanation score following validation or invalidation
|
|
111
|
+
description.score = pattern_result.score
|
|
112
|
+
|
|
113
|
+
results = EntityRecognizer.remove_duplicates(results)
|
|
114
|
+
return results
|
{pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/patterns.py
RENAMED
|
@@ -5,7 +5,7 @@ from typing import Type, List, cast, Dict
|
|
|
5
5
|
|
|
6
6
|
import spacy
|
|
7
7
|
from log_with_context import add_logging_context
|
|
8
|
-
from presidio_analyzer import Pattern,
|
|
8
|
+
from presidio_analyzer import Pattern, RecognizerRegistry, AnalyzerEngine, LemmaContextAwareEnhancer, RecognizerResult
|
|
9
9
|
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration
|
|
10
10
|
from pydantic import BaseModel, Field
|
|
11
11
|
from pymultirole_plugins.util import comma_separated_to_list
|
|
@@ -15,6 +15,8 @@ from spacy.cli.download import download_model, get_compatibility, get_version
|
|
|
15
15
|
from spacy.language import Language
|
|
16
16
|
from wasabi import msg
|
|
17
17
|
|
|
18
|
+
from .named_pattern_recognizer import NamedPatternRecognizer
|
|
19
|
+
|
|
18
20
|
logger = logging.getLogger(__name__)
|
|
19
21
|
|
|
20
22
|
PATTERNS_EXAMPLE = {
|
|
@@ -50,7 +52,7 @@ class PatternsParameters(AnnotatorParameters):
|
|
|
50
52
|
```""" + PATTERNS_EXAMPLE_STR + "```", extra="key:label,val:json")
|
|
51
53
|
|
|
52
54
|
score_threshold: float = Field(0.0, description="Minimum confidence value for detected entities to be returned")
|
|
53
|
-
context_similarity_factor: float = Field(0.
|
|
55
|
+
context_similarity_factor: float = Field(0.35,
|
|
54
56
|
description="How much to enhance confidence of match entity, as explained [here](https://microsoft.github.io/presidio/tutorial/06_context/)",
|
|
55
57
|
extra="advanced")
|
|
56
58
|
min_score_with_context_similarity: float = Field(0.4,
|
|
@@ -111,6 +113,9 @@ class PatternsAnnotator(AnnotatorBase):
|
|
|
111
113
|
for result in results:
|
|
112
114
|
start = s.start + result.start
|
|
113
115
|
end = s.start + + result.end
|
|
116
|
+
props = {k: v for k, v in result.recognition_metadata.items() if
|
|
117
|
+
k not in [RecognizerResult.RECOGNIZER_NAME_KEY,
|
|
118
|
+
RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]}
|
|
114
119
|
document.annotations.append(
|
|
115
120
|
Annotation(
|
|
116
121
|
start=start,
|
|
@@ -118,7 +123,7 @@ class PatternsAnnotator(AnnotatorBase):
|
|
|
118
123
|
text=document.text[start: end],
|
|
119
124
|
labelName=result.entity_type,
|
|
120
125
|
score=result.score,
|
|
121
|
-
properties=
|
|
126
|
+
properties=props
|
|
122
127
|
)
|
|
123
128
|
)
|
|
124
129
|
|
|
@@ -171,7 +176,7 @@ def get_registry(mapping_items, lang):
|
|
|
171
176
|
except BaseException:
|
|
172
177
|
logger.warning(f"Invalid json: {pvalue}", exc_info=True)
|
|
173
178
|
patterns = [Pattern.from_dict(pat) for pat in pattern_definition['patterns']]
|
|
174
|
-
recognizer =
|
|
179
|
+
recognizer = NamedPatternRecognizer(
|
|
175
180
|
supported_entity=pname, supported_language=lang, patterns=patterns, context=pattern_definition.get('context', None)
|
|
176
181
|
)
|
|
177
182
|
recognizers.append(recognizer)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{
|
|
2
|
+
"text": " Coordonnées degrés décimaux, référence devant :\n Exemple : N85,8598654 W150,589654\n\n\n Coordonnées degrés décimaux, référence derrière : \n Exemple 85,8598654N 150,589654W",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"language": "fr"
|
|
5
|
+
},
|
|
6
|
+
"identifier": "c8ecc5153c542c4f6a07fbe813384842",
|
|
7
|
+
"title": "unknown test document"
|
|
8
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"patterns": [
|
|
3
|
+
{
|
|
4
|
+
"name": "ref_lat",
|
|
5
|
+
"regex": "\\b(?P<ref_lat>N|S|n|s)(?P<val_lat>[0-9,]+)\\b",
|
|
6
|
+
"score": 0.9
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"name": "ref_lat_DEVANT",
|
|
10
|
+
"regex": "\\b(?P<val_lat>[0-9,]+)(?P<ref_lat>N|S|n|s)\\b",
|
|
11
|
+
"score": 0.9
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"context": []
|
|
15
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"patterns": [
|
|
3
|
+
{
|
|
4
|
+
"name": "avec préfixe",
|
|
5
|
+
"regex": "(00|\\+)( |\\-|\\.)?(?P<prefix>9[976]\\d|8[987530]\\d|6[987]\\d|5[90]\\d|42\\d|3[875]\\d|2[98654321]\\d|9[8543210]|8[6421]|6[6543210]|5[87654321]|4[987654310]|3[9643210]|2[70]|7|1)( |\\-|\\.)?(?P<number>(?:\\d{1,3}( |\\-|\\.)?){5,14})\\b",
|
|
6
|
+
"score": 0.5
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"name": "sans préfixe",
|
|
10
|
+
"regex": "\\b(?P<number>(?:\\d{1,3}( |\\-|\\.)?){5,14})\\b",
|
|
11
|
+
"score": 0.5
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"context": [
|
|
15
|
+
"appeler",
|
|
16
|
+
"appel",
|
|
17
|
+
"numéro",
|
|
18
|
+
"téléphone",
|
|
19
|
+
"téléphoner"
|
|
20
|
+
]
|
|
21
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from dirty_equals import IsPartialDict
|
|
6
|
+
from pymultirole_plugins.v1.schema import Document
|
|
7
|
+
from pytest_check import check
|
|
8
|
+
|
|
9
|
+
from pyannotators_patterns.patterns import PatternsAnnotator, PatternsParameters
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_coords(
|
|
13
|
+
):
|
|
14
|
+
testdir = Path(__file__).parent
|
|
15
|
+
source = Path(testdir, "data/coords.json")
|
|
16
|
+
with source.open("r") as fin:
|
|
17
|
+
pat = json.load(fin)
|
|
18
|
+
parameters = PatternsParameters(mapping={
|
|
19
|
+
"coords": json.dumps(pat, indent=2)
|
|
20
|
+
})
|
|
21
|
+
source = Path(testdir, "data/coords-document.json")
|
|
22
|
+
with source.open("r") as fin:
|
|
23
|
+
jdoc = json.load(fin)
|
|
24
|
+
doc = Document(**jdoc)
|
|
25
|
+
annotator = PatternsAnnotator()
|
|
26
|
+
docs: List[Document] = annotator.annotate([doc], parameters)
|
|
27
|
+
doc0 = docs[0]
|
|
28
|
+
lat0 = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
|
|
29
|
+
a.text == 'N85,8598654')
|
|
30
|
+
with check:
|
|
31
|
+
assert lat0 == IsPartialDict(labelName='coords', text='N85,8598654',
|
|
32
|
+
properties=IsPartialDict(ref_lat='N', val_lat='85,8598654'))
|
|
33
|
+
|
|
34
|
+
lat1 = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
|
|
35
|
+
a.text == '85,8598654N')
|
|
36
|
+
with check:
|
|
37
|
+
assert lat1 == IsPartialDict(labelName='coords', text='85,8598654N', properties=IsPartialDict(ref_lat='N', val_lat='85,8598654'))
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from dirty_equals import IsPartialDict
|
|
6
|
+
from pymultirole_plugins.v1.schema import Document
|
|
7
|
+
from pytest_check import check
|
|
8
|
+
|
|
9
|
+
from pyannotators_patterns.patterns import PatternsAnnotator, PatternsParameters
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_tel(
|
|
13
|
+
):
|
|
14
|
+
testdir = Path(__file__).parent
|
|
15
|
+
source = Path(testdir, "data/tel.json")
|
|
16
|
+
with source.open("r") as fin:
|
|
17
|
+
pat = json.load(fin)
|
|
18
|
+
parameters = PatternsParameters(mapping={
|
|
19
|
+
"telephone": json.dumps(pat, indent=2)
|
|
20
|
+
})
|
|
21
|
+
source = Path(testdir, "data/tel-document.json")
|
|
22
|
+
with source.open("r") as fin:
|
|
23
|
+
jdoc = json.load(fin)
|
|
24
|
+
doc = Document(**jdoc)
|
|
25
|
+
annotator = PatternsAnnotator()
|
|
26
|
+
docs: List[Document] = annotator.annotate([doc], parameters)
|
|
27
|
+
doc0 = docs[0]
|
|
28
|
+
tel = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
|
|
29
|
+
a.text == '+33.089-658-6494')
|
|
30
|
+
with check:
|
|
31
|
+
assert tel == IsPartialDict(labelName='telephone', text='+33.089-658-6494', score=0.85,
|
|
32
|
+
properties=IsPartialDict(prefix='33', number='089-658-6494'))
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|