pyannotators-patterns 0.5.80__tar.gz → 0.5.84__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/PKG-INFO +3 -2
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/pyproject.toml +2 -1
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/setup.py +3 -2
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/src/pyannotators_patterns/__init__.py +1 -1
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/src/pyannotators_patterns/patterns.py +43 -38
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/.bumpversion.cfg +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/.github/workflows/main.yml +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/.gitignore +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/.pre-commit-config.yaml +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/.readthedocs.yml +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/AUTHORS.md +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/Dockerfile +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/Jenkinsfile +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/LICENSE +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/README.md +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/RELEASE.md +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/bumpversion.py +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/.gitignore +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/LICENSE +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/_static/.gitkeep +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/_templates/.gitkeep +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/conf.py +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/docs/index.rst +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/hgnc_cache.sqlite +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/mypy.ini +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/tests/assertions.py +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/tests/test_credit_cards.py +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/tests/test_emails.py +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/tests/test_zip.py +0 -0
- {pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/tox.ini +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pyannotators-patterns
|
|
3
|
-
Version: 0.5.80
|
|
3
|
+
Version: 0.5.84
|
|
4
4
|
Summary: Annotator based on Presidio pattern recognizer
|
|
5
5
|
Home-page: https://github.com/oterrier/pyannotators_patterns/
|
|
6
6
|
Keywords:
|
|
@@ -21,9 +21,10 @@ Classifier: Development Status :: 4 - Beta
|
|
|
21
21
|
Classifier: Programming Language :: Python :: 3.8
|
|
22
22
|
Requires-Dist: pymultirole-plugins>=0.5.0,<0.6.0
|
|
23
23
|
Requires-Dist: spacy[lookups]==3.4.4
|
|
24
|
+
Requires-Dist: log-with-context
|
|
24
25
|
Requires-Dist: collections_extended
|
|
25
26
|
Requires-Dist: unidecode
|
|
26
|
-
Requires-Dist: presidio-analyzer
|
|
27
|
+
Requires-Dist: presidio-analyzer==2.2.354
|
|
27
28
|
Requires-Dist: flit ; extra == "dev"
|
|
28
29
|
Requires-Dist: pre-commit ; extra == "dev"
|
|
29
30
|
Requires-Dist: bump2version ; extra == "dev"
|
|
@@ -27,9 +27,10 @@ classifiers = [
|
|
|
27
27
|
requires = [
|
|
28
28
|
"pymultirole-plugins>=0.5.0,<0.6.0",
|
|
29
29
|
"spacy[lookups]==3.4.4",
|
|
30
|
+
"log-with-context",
|
|
30
31
|
"collections_extended",
|
|
31
32
|
"unidecode",
|
|
32
|
-
"presidio-analyzer"
|
|
33
|
+
"presidio-analyzer==2.2.354"
|
|
33
34
|
]
|
|
34
35
|
dist-name = "pyannotators-patterns"
|
|
35
36
|
|
|
@@ -15,9 +15,10 @@ package_dir = \
|
|
|
15
15
|
install_requires = \
|
|
16
16
|
['pymultirole-plugins>=0.5.0,<0.6.0',
|
|
17
17
|
'spacy[lookups]==3.4.4',
|
|
18
|
+
'log-with-context',
|
|
18
19
|
'collections_extended',
|
|
19
20
|
'unidecode',
|
|
20
|
-
'presidio-analyzer']
|
|
21
|
+
'presidio-analyzer==2.2.354']
|
|
21
22
|
|
|
22
23
|
extras_require = \
|
|
23
24
|
{'dev': ['flit', 'pre-commit', 'bump2version'],
|
|
@@ -39,7 +40,7 @@ entry_points = \
|
|
|
39
40
|
'pyannotators_patterns.patterns:PatternsAnnotator']}
|
|
40
41
|
|
|
41
42
|
setup(name='pyannotators-patterns',
|
|
42
|
-
version='0.5.80',
|
|
43
|
+
version='0.5.84',
|
|
43
44
|
description='Annotator based on Presidio pattern recognizer',
|
|
44
45
|
author='Olivier Terrier',
|
|
45
46
|
author_email='olivier.terrier@kairntech.com',
|
{pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/src/pyannotators_patterns/__init__.py
RENAMED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""Annotator based on Presidio pattern recognizer"""
|
|
2
|
-
__version__ = "0.5.80"
|
|
2
|
+
__version__ = "0.5.84"
|
{pyannotators_patterns-0.5.80 → pyannotators_patterns-0.5.84}/src/pyannotators_patterns/patterns.py
RENAMED
|
@@ -4,6 +4,7 @@ from functools import lru_cache
|
|
|
4
4
|
from typing import Type, List, cast, Dict
|
|
5
5
|
|
|
6
6
|
import spacy
|
|
7
|
+
from log_with_context import add_logging_context
|
|
7
8
|
from presidio_analyzer import Pattern, PatternRecognizer, RecognizerRegistry, AnalyzerEngine, LemmaContextAwareEnhancer
|
|
8
9
|
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration
|
|
9
10
|
from pydantic import BaseModel, Field
|
|
@@ -81,44 +82,45 @@ class PatternsAnnotator(AnnotatorBase):
|
|
|
81
82
|
mapping = frozenset(params.mapping.items())
|
|
82
83
|
labels = list(params.mapping.keys())
|
|
83
84
|
for document in documents:
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
85
|
+
with add_logging_context(docid=document.identifier):
|
|
86
|
+
# Retrieve nlp pipe
|
|
87
|
+
lang = document_language(document, None)
|
|
88
|
+
if lang is None or lang not in supported_languages:
|
|
89
|
+
raise AttributeError(f"Metadata language {lang} is required and must be in {SUPPORTED_LANGUAGES}")
|
|
90
|
+
nlp = get_nlp(lang)
|
|
91
|
+
registry = get_registry(mapping, lang)
|
|
92
|
+
analyzer = AnalyzerEngine(registry=registry,
|
|
93
|
+
nlp_engine=LoadedSpacyNlpEngine(lang, nlp),
|
|
94
|
+
default_score_threshold=0,
|
|
95
|
+
supported_languages=supported_languages,
|
|
96
|
+
context_aware_enhancer=LemmaContextAwareEnhancer(params.context_similarity_factor,
|
|
97
|
+
params.min_score_with_context_similarity,
|
|
98
|
+
params.context_prefix_count,
|
|
99
|
+
params.context_suffix_count))
|
|
100
|
+
|
|
101
|
+
document.annotations = []
|
|
102
|
+
if not document.sentences:
|
|
103
|
+
document.sentences = [Span(start=0, end=len(document.text))]
|
|
104
|
+
|
|
105
|
+
for s in document.sentences:
|
|
106
|
+
if s.end > s.start:
|
|
107
|
+
stext = document.text[s.start: s.end]
|
|
108
|
+
results = analyzer.analyze(text=stext, entities=labels,
|
|
109
|
+
language=lang, return_decision_process=True,
|
|
110
|
+
score_threshold=params.score_threshold) # noqa D501
|
|
111
|
+
for result in results:
|
|
112
|
+
start = s.start + result.start
|
|
113
|
+
end = s.start + + result.end
|
|
114
|
+
document.annotations.append(
|
|
115
|
+
Annotation(
|
|
116
|
+
start=start,
|
|
117
|
+
end=end,
|
|
118
|
+
text=document.text[start: end],
|
|
119
|
+
labelName=result.entity_type,
|
|
120
|
+
score=result.score,
|
|
121
|
+
properties={"analysis_explanation": result.analysis_explanation}
|
|
122
|
+
)
|
|
120
123
|
)
|
|
121
|
-
)
|
|
122
124
|
|
|
123
125
|
return documents
|
|
124
126
|
|
|
@@ -164,7 +166,10 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
|
|
164
166
|
def get_registry(mapping_items, lang):
|
|
165
167
|
recognizers = []
|
|
166
168
|
for pname, pvalue in mapping_items:
|
|
167
|
-
|
|
169
|
+
try:
|
|
170
|
+
pattern_definition = json.loads(pvalue)
|
|
171
|
+
except BaseException:
|
|
172
|
+
logger.warning(f"Invalid json: {pvalue}", exc_info=True)
|
|
168
173
|
patterns = [Pattern.from_dict(pat) for pat in pattern_definition['patterns']]
|
|
169
174
|
recognizer = PatternRecognizer(
|
|
170
175
|
supported_entity=pname, supported_language=lang, patterns=patterns, context=pattern_definition.get('context', None)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|