pyannotators-patterns 0.5.78__tar.gz → 0.5.82__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/PKG-INFO +2 -1
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/pyproject.toml +1 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/setup.py +2 -1
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/src/pyannotators_patterns/__init__.py +1 -1
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/src/pyannotators_patterns/patterns.py +45 -41
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/.bumpversion.cfg +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/.github/workflows/main.yml +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/.gitignore +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/.pre-commit-config.yaml +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/.readthedocs.yml +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/AUTHORS.md +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/Dockerfile +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/Jenkinsfile +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/LICENSE +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/README.md +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/RELEASE.md +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/bumpversion.py +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/.gitignore +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/LICENSE +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/_static/.gitkeep +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/_templates/.gitkeep +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/conf.py +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/docs/index.rst +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/hgnc_cache.sqlite +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/mypy.ini +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/tests/assertions.py +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/tests/test_credit_cards.py +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/tests/test_emails.py +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/tests/test_zip.py +0 -0
- {pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/tox.ini +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pyannotators-patterns
|
|
3
|
-
Version: 0.5.78
|
|
3
|
+
Version: 0.5.82
|
|
4
4
|
Summary: Annotator based on Presidio pattern recognizer
|
|
5
5
|
Home-page: https://github.com/oterrier/pyannotators_patterns/
|
|
6
6
|
Keywords:
|
|
@@ -21,6 +21,7 @@ Classifier: Development Status :: 4 - Beta
|
|
|
21
21
|
Classifier: Programming Language :: Python :: 3.8
|
|
22
22
|
Requires-Dist: pymultirole-plugins>=0.5.0,<0.6.0
|
|
23
23
|
Requires-Dist: spacy[lookups]==3.4.4
|
|
24
|
+
Requires-Dist: log-with-context
|
|
24
25
|
Requires-Dist: collections_extended
|
|
25
26
|
Requires-Dist: unidecode
|
|
26
27
|
Requires-Dist: presidio-analyzer>=2.2.352
|
|
@@ -15,6 +15,7 @@ package_dir = \
|
|
|
15
15
|
install_requires = \
|
|
16
16
|
['pymultirole-plugins>=0.5.0,<0.6.0',
|
|
17
17
|
'spacy[lookups]==3.4.4',
|
|
18
|
+
'log-with-context',
|
|
18
19
|
'collections_extended',
|
|
19
20
|
'unidecode',
|
|
20
21
|
'presidio-analyzer>=2.2.352']
|
|
@@ -39,7 +40,7 @@ entry_points = \
|
|
|
39
40
|
'pyannotators_patterns.patterns:PatternsAnnotator']}
|
|
40
41
|
|
|
41
42
|
setup(name='pyannotators-patterns',
|
|
42
|
-
version='0.5.78',
|
|
43
|
+
version='0.5.82',
|
|
43
44
|
description='Annotator based on Presidio pattern recognizer',
|
|
44
45
|
author='Olivier Terrier',
|
|
45
46
|
author_email='olivier.terrier@kairntech.com',
|
{pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/src/pyannotators_patterns/__init__.py
RENAMED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""Annotator based on Presidio pattern recognizer"""
|
|
2
|
-
__version__ = "0.5.78"
|
|
2
|
+
__version__ = "0.5.82"
|
{pyannotators_patterns-0.5.78 → pyannotators_patterns-0.5.82}/src/pyannotators_patterns/patterns.py
RENAMED
|
@@ -4,6 +4,7 @@ from functools import lru_cache
|
|
|
4
4
|
from typing import Type, List, cast, Dict
|
|
5
5
|
|
|
6
6
|
import spacy
|
|
7
|
+
from log_with_context import add_logging_context
|
|
7
8
|
from presidio_analyzer import Pattern, PatternRecognizer, RecognizerRegistry, AnalyzerEngine, LemmaContextAwareEnhancer
|
|
8
9
|
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration
|
|
9
10
|
from pydantic import BaseModel, Field
|
|
@@ -80,46 +81,46 @@ class PatternsAnnotator(AnnotatorBase):
|
|
|
80
81
|
|
|
81
82
|
mapping = frozenset(params.mapping.items())
|
|
82
83
|
labels = list(params.mapping.keys())
|
|
83
|
-
registry = get_registry(mapping)
|
|
84
|
-
|
|
85
84
|
for document in documents:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
document.
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
85
|
+
with add_logging_context(docid=document.identifier):
|
|
86
|
+
# Retrieve nlp pipe
|
|
87
|
+
lang = document_language(document, None)
|
|
88
|
+
if lang is None or lang not in supported_languages:
|
|
89
|
+
raise AttributeError(f"Metadata language {lang} is required and must be in {SUPPORTED_LANGUAGES}")
|
|
90
|
+
nlp = get_nlp(lang)
|
|
91
|
+
registry = get_registry(mapping, lang)
|
|
92
|
+
analyzer = AnalyzerEngine(registry=registry,
|
|
93
|
+
nlp_engine=LoadedSpacyNlpEngine(lang, nlp),
|
|
94
|
+
default_score_threshold=0,
|
|
95
|
+
supported_languages=supported_languages,
|
|
96
|
+
context_aware_enhancer=LemmaContextAwareEnhancer(params.context_similarity_factor,
|
|
97
|
+
params.min_score_with_context_similarity,
|
|
98
|
+
params.context_prefix_count,
|
|
99
|
+
params.context_suffix_count))
|
|
100
|
+
|
|
101
|
+
document.annotations = []
|
|
102
|
+
if not document.sentences:
|
|
103
|
+
document.sentences = [Span(start=0, end=len(document.text))]
|
|
104
|
+
|
|
105
|
+
for s in document.sentences:
|
|
106
|
+
if s.end > s.start:
|
|
107
|
+
stext = document.text[s.start: s.end]
|
|
108
|
+
results = analyzer.analyze(text=stext, entities=labels,
|
|
109
|
+
language=lang, return_decision_process=True,
|
|
110
|
+
score_threshold=params.score_threshold) # noqa D501
|
|
111
|
+
for result in results:
|
|
112
|
+
start = s.start + result.start
|
|
113
|
+
end = s.start + + result.end
|
|
114
|
+
document.annotations.append(
|
|
115
|
+
Annotation(
|
|
116
|
+
start=start,
|
|
117
|
+
end=end,
|
|
118
|
+
text=document.text[start: end],
|
|
119
|
+
labelName=result.entity_type,
|
|
120
|
+
score=result.score,
|
|
121
|
+
properties={"analysis_explanation": result.analysis_explanation}
|
|
122
|
+
)
|
|
121
123
|
)
|
|
122
|
-
)
|
|
123
124
|
|
|
124
125
|
return documents
|
|
125
126
|
|
|
@@ -162,13 +163,16 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
|
|
162
163
|
|
|
163
164
|
|
|
164
165
|
@lru_cache(maxsize=None)
|
|
165
|
-
def get_registry(mapping_items):
|
|
166
|
+
def get_registry(mapping_items, lang):
|
|
166
167
|
recognizers = []
|
|
167
168
|
for pname, pvalue in mapping_items:
|
|
168
|
-
|
|
169
|
+
try:
|
|
170
|
+
pattern_definition = json.loads(pvalue)
|
|
171
|
+
except BaseException:
|
|
172
|
+
logger.warning(f"Invalid json: {pvalue}", exc_info=True)
|
|
169
173
|
patterns = [Pattern.from_dict(pat) for pat in pattern_definition['patterns']]
|
|
170
174
|
recognizer = PatternRecognizer(
|
|
171
|
-
supported_entity=pname, patterns=patterns, context=pattern_definition.get('context', None)
|
|
175
|
+
supported_entity=pname, supported_language=lang, patterns=patterns, context=pattern_definition.get('context', None)
|
|
172
176
|
)
|
|
173
177
|
recognizers.append(recognizer)
|
|
174
178
|
registry = RecognizerRegistry(recognizers)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|