pyannotators-patterns 0.5.84__tar.gz → 0.5.88__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/PKG-INFO +2 -1
  2. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/pyproject.toml +1 -1
  3. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/setup.py +2 -1
  4. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/__init__.py +1 -1
  5. pyannotators_patterns-0.5.88/src/pyannotators_patterns/named_pattern_recognizer.py +114 -0
  6. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/patterns.py +9 -4
  7. pyannotators_patterns-0.5.88/tests/data/coords-document.json +8 -0
  8. pyannotators_patterns-0.5.88/tests/data/coords.json +15 -0
  9. pyannotators_patterns-0.5.88/tests/data/tel-document.json +8 -0
  10. pyannotators_patterns-0.5.88/tests/data/tel.json +21 -0
  11. pyannotators_patterns-0.5.88/tests/test_coords.py +37 -0
  12. pyannotators_patterns-0.5.88/tests/test_tel.py +32 -0
  13. pyannotators_patterns-0.5.84/hgnc_cache.sqlite +0 -0
  14. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.bumpversion.cfg +0 -0
  15. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.github/workflows/main.yml +0 -0
  16. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.gitignore +0 -0
  17. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.pre-commit-config.yaml +0 -0
  18. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/.readthedocs.yml +0 -0
  19. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/AUTHORS.md +0 -0
  20. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/CHANGELOG.md +0 -0
  21. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/Dockerfile +0 -0
  22. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/Jenkinsfile +0 -0
  23. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/LICENSE +0 -0
  24. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/README.md +0 -0
  25. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/RELEASE.md +0 -0
  26. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/bumpversion.py +0 -0
  27. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/.gitignore +0 -0
  28. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/CHANGELOG.md +0 -0
  29. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/LICENSE +0 -0
  30. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/_static/.gitkeep +0 -0
  31. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/_templates/.gitkeep +0 -0
  32. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/conf.py +0 -0
  33. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/docs/index.rst +0 -0
  34. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/mypy.ini +0 -0
  35. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/assertions.py +0 -0
  36. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/test_credit_cards.py +0 -0
  37. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/test_emails.py +0 -0
  38. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tests/test_zip.py +0 -0
  39. {pyannotators_patterns-0.5.84 → pyannotators_patterns-0.5.88}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pyannotators-patterns
3
- Version: 0.5.84
3
+ Version: 0.5.88
4
4
  Summary: Annotator based on Presidio pattern recognizer
5
5
  Home-page: https://github.com/oterrier/pyannotators_patterns/
6
6
  Keywords:
@@ -37,6 +37,7 @@ Requires-Dist: pytest ; extra == "test"
37
37
  Requires-Dist: pytest-cov ; extra == "test"
38
38
  Requires-Dist: pytest-flake8 ; extra == "test"
39
39
  Requires-Dist: pytest-black ; extra == "test"
40
+ Requires-Dist: pytest_check ; extra == "test"
40
41
  Requires-Dist: flake8==3.9.2 ; extra == "test"
41
42
  Requires-Dist: tox ; extra == "test"
42
43
  Requires-Dist: dirty-equals ; extra == "test"
@@ -43,7 +43,7 @@ test = [
43
43
  "pytest-cov",
44
44
  "pytest-flake8",
45
45
  "pytest-black",
46
- # "hypothesis",
46
+ "pytest_check",
47
47
  "flake8==3.9.2",
48
48
  "tox",
49
49
  "dirty-equals"
@@ -31,6 +31,7 @@ extras_require = \
31
31
  'pytest-cov',
32
32
  'pytest-flake8',
33
33
  'pytest-black',
34
+ 'pytest_check',
34
35
  'flake8==3.9.2',
35
36
  'tox',
36
37
  'dirty-equals']}
@@ -40,7 +41,7 @@ entry_points = \
40
41
  'pyannotators_patterns.patterns:PatternsAnnotator']}
41
42
 
42
43
  setup(name='pyannotators-patterns',
43
- version='0.5.84',
44
+ version='0.5.88',
44
45
  description='Annotator based on Presidio pattern recognizer',
45
46
  author='Olivier Terrier',
46
47
  author_email='olivier.terrier@kairntech.com',
@@ -1,2 +1,2 @@
1
1
  """Annotator based on Presidio pattern recognizer"""
2
- __version__ = "0.5.84"
2
+ __version__ = "0.5.88"
@@ -0,0 +1,114 @@
1
+ import re
2
+ from typing import List, Optional
3
+
4
+ from presidio_analyzer import Pattern, PatternRecognizer, RecognizerResult, EntityRecognizer
5
+ from presidio_analyzer.nlp_engine import NlpArtifacts
6
+
7
+
8
+ class NamedPatternRecognizer(PatternRecognizer):
9
+ def __init__(
10
+ self,
11
+ supported_entity: str,
12
+ name: str = None,
13
+ supported_language: str = "en",
14
+ patterns: List[Pattern] = None,
15
+ deny_list: List[str] = None,
16
+ context: List[str] = None,
17
+ deny_list_score: float = 1.0,
18
+ global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
19
+ version: str = "0.0.1",
20
+ ):
21
+ super().__init__(supported_entity, name, supported_language, patterns, deny_list, context, deny_list_score,
22
+ global_regex_flags, version)
23
+
24
+ def analyze(
25
+ self,
26
+ text: str,
27
+ entities: List[str],
28
+ nlp_artifacts: Optional[NlpArtifacts] = None,
29
+ regex_flags: Optional[int] = None,
30
+ ) -> List[RecognizerResult]:
31
+ """
32
+ Analyzes text to detect PII using regular expressions or deny-lists.
33
+
34
+ :param text: Text to be analyzed
35
+ :param entities: Entities this recognizer can detect
36
+ :param nlp_artifacts: Output values from the NLP engine
37
+ :param regex_flags: regex flags to be used in regex matching
38
+ :return:
39
+ """
40
+ results = []
41
+
42
+ if self.patterns:
43
+ pattern_result = self.__analyze_patterns(text, regex_flags)
44
+ results.extend(pattern_result)
45
+
46
+ return results
47
+
48
+ def __analyze_patterns(
49
+ self, text: str, flags: int = None
50
+ ) -> List[RecognizerResult]:
51
+ """
52
+ Evaluate all patterns in the provided text.
53
+
54
+ Including words in the provided deny-list
55
+
56
+ :param text: text to analyze
57
+ :param flags: regex flags
58
+ :return: A list of RecognizerResult
59
+ """
60
+ flags = flags if flags else self.global_regex_flags
61
+ results = []
62
+ for pattern in self.patterns:
63
+ # Compile regex if flags differ from flags the regex was compiled with
64
+ if not pattern.compiled_regex or pattern.compiled_with_flags != flags:
65
+ pattern.compiled_with_flags = flags
66
+ pattern.compiled_regex = re.compile(pattern.regex, flags=flags)
67
+
68
+ matches = pattern.compiled_regex.finditer(text)
69
+ for match in matches:
70
+ start, end = match.span()
71
+ current_match = text[start:end]
72
+
73
+ # Skip empty results
74
+ if current_match == "":
75
+ continue
76
+
77
+ score = pattern.score
78
+
79
+ validation_result = self.validate_result(current_match)
80
+ description = self.build_regex_explanation(
81
+ self.name,
82
+ pattern.name,
83
+ pattern.regex,
84
+ score,
85
+ validation_result,
86
+ flags,
87
+ )
88
+ pattern_result = RecognizerResult(
89
+ entity_type=self.supported_entities[0],
90
+ start=start,
91
+ end=end,
92
+ score=score,
93
+ analysis_explanation=description,
94
+ recognition_metadata=match.groupdict()
95
+ )
96
+
97
+ if validation_result is not None:
98
+ if validation_result:
99
+ pattern_result.score = EntityRecognizer.MAX_SCORE
100
+ else:
101
+ pattern_result.score = EntityRecognizer.MIN_SCORE
102
+
103
+ invalidation_result = self.invalidate_result(current_match)
104
+ if invalidation_result is not None and invalidation_result:
105
+ pattern_result.score = EntityRecognizer.MIN_SCORE
106
+
107
+ if pattern_result.score > EntityRecognizer.MIN_SCORE:
108
+ results.append(pattern_result)
109
+
110
+ # Update analysis explanation score following validation or invalidation
111
+ description.score = pattern_result.score
112
+
113
+ results = EntityRecognizer.remove_duplicates(results)
114
+ return results
@@ -5,7 +5,7 @@ from typing import Type, List, cast, Dict
5
5
 
6
6
  import spacy
7
7
  from log_with_context import add_logging_context
8
- from presidio_analyzer import Pattern, PatternRecognizer, RecognizerRegistry, AnalyzerEngine, LemmaContextAwareEnhancer
8
+ from presidio_analyzer import Pattern, RecognizerRegistry, AnalyzerEngine, LemmaContextAwareEnhancer, RecognizerResult
9
9
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration
10
10
  from pydantic import BaseModel, Field
11
11
  from pymultirole_plugins.util import comma_separated_to_list
@@ -15,6 +15,8 @@ from spacy.cli.download import download_model, get_compatibility, get_version
15
15
  from spacy.language import Language
16
16
  from wasabi import msg
17
17
 
18
+ from .named_pattern_recognizer import NamedPatternRecognizer
19
+
18
20
  logger = logging.getLogger(__name__)
19
21
 
20
22
  PATTERNS_EXAMPLE = {
@@ -50,7 +52,7 @@ class PatternsParameters(AnnotatorParameters):
50
52
  ```""" + PATTERNS_EXAMPLE_STR + "```", extra="key:label,val:json")
51
53
 
52
54
  score_threshold: float = Field(0.0, description="Minimum confidence value for detected entities to be returned")
53
- context_similarity_factor: float = Field(0.0,
55
+ context_similarity_factor: float = Field(0.35,
54
56
  description="How much to enhance confidence of match entity, as explained [here](https://microsoft.github.io/presidio/tutorial/06_context/)",
55
57
  extra="advanced")
56
58
  min_score_with_context_similarity: float = Field(0.4,
@@ -111,6 +113,9 @@ class PatternsAnnotator(AnnotatorBase):
111
113
  for result in results:
112
114
  start = s.start + result.start
113
115
  end = s.start + + result.end
116
+ props = {k: v for k, v in result.recognition_metadata.items() if
117
+ k not in [RecognizerResult.RECOGNIZER_NAME_KEY,
118
+ RecognizerResult.RECOGNIZER_IDENTIFIER_KEY]}
114
119
  document.annotations.append(
115
120
  Annotation(
116
121
  start=start,
@@ -118,7 +123,7 @@ class PatternsAnnotator(AnnotatorBase):
118
123
  text=document.text[start: end],
119
124
  labelName=result.entity_type,
120
125
  score=result.score,
121
- properties={"analysis_explanation": result.analysis_explanation}
126
+ properties=props
122
127
  )
123
128
  )
124
129
 
@@ -171,7 +176,7 @@ def get_registry(mapping_items, lang):
171
176
  except BaseException:
172
177
  logger.warning(f"Invalid json: {pvalue}", exc_info=True)
173
178
  patterns = [Pattern.from_dict(pat) for pat in pattern_definition['patterns']]
174
- recognizer = PatternRecognizer(
179
+ recognizer = NamedPatternRecognizer(
175
180
  supported_entity=pname, supported_language=lang, patterns=patterns, context=pattern_definition.get('context', None)
176
181
  )
177
182
  recognizers.append(recognizer)
@@ -0,0 +1,8 @@
1
+ {
2
+ "text": " Coordonnées degrés décimaux, référence devant :\n Exemple : N85,8598654 W150,589654\n\n\n Coordonnées degrés décimaux, référence derrière : \n Exemple 85,8598654N 150,589654W",
3
+ "metadata": {
4
+ "language": "fr"
5
+ },
6
+ "identifier": "c8ecc5153c542c4f6a07fbe813384842",
7
+ "title": "unknown test document"
8
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "patterns": [
3
+ {
4
+ "name": "ref_lat",
5
+ "regex": "\\b(?P<ref_lat>N|S|n|s)(?P<val_lat>[0-9,]+)\\b",
6
+ "score": 0.9
7
+ },
8
+ {
9
+ "name": "ref_lat_DEVANT",
10
+ "regex": "\\b(?P<val_lat>[0-9,]+)(?P<ref_lat>N|S|n|s)\\b",
11
+ "score": 0.9
12
+ }
13
+ ],
14
+ "context": []
15
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ "text": "Si vous êtes intéressé, veuillez nous appeler au +33.089-658-6494.",
3
+ "metadata": {
4
+ "language": "fr"
5
+ },
6
+ "identifier": "c9321541549ae539665b8bbf440cb1a0",
7
+ "title": "unknown test document"
8
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "patterns": [
3
+ {
4
+ "name": "avec préfixe",
5
+ "regex": "(00|\\+)( |\\-|\\.)?(?P<prefix>9[976]\\d|8[987530]\\d|6[987]\\d|5[90]\\d|42\\d|3[875]\\d|2[98654321]\\d|9[8543210]|8[6421]|6[6543210]|5[87654321]|4[987654310]|3[9643210]|2[70]|7|1)( |\\-|\\.)?(?P<number>(?:\\d{1,3}( |\\-|\\.)?){5,14})\\b",
6
+ "score": 0.5
7
+ },
8
+ {
9
+ "name": "sans préfixe",
10
+ "regex": "\\b(?P<number>(?:\\d{1,3}( |\\-|\\.)?){5,14})\\b",
11
+ "score": 0.5
12
+ }
13
+ ],
14
+ "context": [
15
+ "appeler",
16
+ "appel",
17
+ "numéro",
18
+ "téléphone",
19
+ "téléphoner"
20
+ ]
21
+ }
@@ -0,0 +1,37 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ from dirty_equals import IsPartialDict
6
+ from pymultirole_plugins.v1.schema import Document
7
+ from pytest_check import check
8
+
9
+ from pyannotators_patterns.patterns import PatternsAnnotator, PatternsParameters
10
+
11
+
12
+ def test_coords(
13
+ ):
14
+ testdir = Path(__file__).parent
15
+ source = Path(testdir, "data/coords.json")
16
+ with source.open("r") as fin:
17
+ pat = json.load(fin)
18
+ parameters = PatternsParameters(mapping={
19
+ "coords": json.dumps(pat, indent=2)
20
+ })
21
+ source = Path(testdir, "data/coords-document.json")
22
+ with source.open("r") as fin:
23
+ jdoc = json.load(fin)
24
+ doc = Document(**jdoc)
25
+ annotator = PatternsAnnotator()
26
+ docs: List[Document] = annotator.annotate([doc], parameters)
27
+ doc0 = docs[0]
28
+ lat0 = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
29
+ a.text == 'N85,8598654')
30
+ with check:
31
+ assert lat0 == IsPartialDict(labelName='coords', text='N85,8598654',
32
+ properties=IsPartialDict(ref_lat='N', val_lat='85,8598654'))
33
+
34
+ lat1 = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
35
+ a.text == '85,8598654N')
36
+ with check:
37
+ assert lat1 == IsPartialDict(labelName='coords', text='85,8598654N', properties=IsPartialDict(ref_lat='N', val_lat='85,8598654'))
@@ -0,0 +1,32 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ from dirty_equals import IsPartialDict
6
+ from pymultirole_plugins.v1.schema import Document
7
+ from pytest_check import check
8
+
9
+ from pyannotators_patterns.patterns import PatternsAnnotator, PatternsParameters
10
+
11
+
12
+ def test_tel(
13
+ ):
14
+ testdir = Path(__file__).parent
15
+ source = Path(testdir, "data/tel.json")
16
+ with source.open("r") as fin:
17
+ pat = json.load(fin)
18
+ parameters = PatternsParameters(mapping={
19
+ "telephone": json.dumps(pat, indent=2)
20
+ })
21
+ source = Path(testdir, "data/tel-document.json")
22
+ with source.open("r") as fin:
23
+ jdoc = json.load(fin)
24
+ doc = Document(**jdoc)
25
+ annotator = PatternsAnnotator()
26
+ docs: List[Document] = annotator.annotate([doc], parameters)
27
+ doc0 = docs[0]
28
+ tel = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
29
+ a.text == '+33.089-658-6494')
30
+ with check:
31
+ assert tel == IsPartialDict(labelName='telephone', text='+33.089-658-6494', score=0.85,
32
+ properties=IsPartialDict(prefix='33', number='089-658-6494'))