pyannotators-patterns 0.5.86__tar.gz → 0.5.88__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/PKG-INFO +1 -1
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/setup.py +1 -1
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/__init__.py +1 -1
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/patterns.py +1 -1
- pyannotators_patterns-0.5.88/tests/data/coords-document.json +8 -0
- pyannotators_patterns-0.5.88/tests/data/tel-document.json +8 -0
- pyannotators_patterns-0.5.88/tests/data/tel.json +21 -0
- pyannotators_patterns-0.5.88/tests/test_tel.py +32 -0
- pyannotators_patterns-0.5.86/tests/data/coords-document.json +0 -86
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/.bumpversion.cfg +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/.github/workflows/main.yml +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/.gitignore +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/.pre-commit-config.yaml +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/.readthedocs.yml +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/AUTHORS.md +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/Dockerfile +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/Jenkinsfile +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/LICENSE +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/README.md +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/RELEASE.md +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/bumpversion.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/.gitignore +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/CHANGELOG.md +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/LICENSE +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/_static/.gitkeep +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/_templates/.gitkeep +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/conf.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/docs/index.rst +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/mypy.ini +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/pyproject.toml +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/named_pattern_recognizer.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tests/assertions.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tests/data/coords.json +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tests/test_coords.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tests/test_credit_cards.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tests/test_emails.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tests/test_zip.py +0 -0
- {pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/tox.ini +0 -0
|
@@ -41,7 +41,7 @@ entry_points = \
|
|
|
41
41
|
'pyannotators_patterns.patterns:PatternsAnnotator']}
|
|
42
42
|
|
|
43
43
|
setup(name='pyannotators-patterns',
|
|
44
|
-
version='0.5.86',
|
|
44
|
+
version='0.5.88',
|
|
45
45
|
description='Annotator based on Presidio pattern recognizer',
|
|
46
46
|
author='Olivier Terrier',
|
|
47
47
|
author_email='olivier.terrier@kairntech.com',
|
{pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/__init__.py
RENAMED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
"""Annotator based on Presidio pattern recognizer"""
|
|
2
|
-
__version__ = "0.5.86"
|
|
2
|
+
__version__ = "0.5.88"
|
{pyannotators_patterns-0.5.86 → pyannotators_patterns-0.5.88}/src/pyannotators_patterns/patterns.py
RENAMED
|
@@ -52,7 +52,7 @@ class PatternsParameters(AnnotatorParameters):
|
|
|
52
52
|
```""" + PATTERNS_EXAMPLE_STR + "```", extra="key:label,val:json")
|
|
53
53
|
|
|
54
54
|
score_threshold: float = Field(0.0, description="Minimum confidence value for detected entities to be returned")
|
|
55
|
-
context_similarity_factor: float = Field(0.
|
|
55
|
+
context_similarity_factor: float = Field(0.35,
|
|
56
56
|
description="How much to enhance confidence of match entity, as explained [here](https://microsoft.github.io/presidio/tutorial/06_context/)",
|
|
57
57
|
extra="advanced")
|
|
58
58
|
min_score_with_context_similarity: float = Field(0.4,
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{
|
|
2
|
+
"text": " Coordonnées degrés décimaux, référence devant :\n Exemple : N85,8598654 W150,589654\n\n\n Coordonnées degrés décimaux, référence derrière : \n Exemple 85,8598654N 150,589654W",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"language": "fr"
|
|
5
|
+
},
|
|
6
|
+
"identifier": "c8ecc5153c542c4f6a07fbe813384842",
|
|
7
|
+
"title": "unknown test document"
|
|
8
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"patterns": [
|
|
3
|
+
{
|
|
4
|
+
"name": "avec préfixe",
|
|
5
|
+
"regex": "(00|\\+)( |\\-|\\.)?(?P<prefix>9[976]\\d|8[987530]\\d|6[987]\\d|5[90]\\d|42\\d|3[875]\\d|2[98654321]\\d|9[8543210]|8[6421]|6[6543210]|5[87654321]|4[987654310]|3[9643210]|2[70]|7|1)( |\\-|\\.)?(?P<number>(?:\\d{1,3}( |\\-|\\.)?){5,14})\\b",
|
|
6
|
+
"score": 0.5
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"name": "sans préfixe",
|
|
10
|
+
"regex": "\\b(?P<number>(?:\\d{1,3}( |\\-|\\.)?){5,14})\\b",
|
|
11
|
+
"score": 0.5
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"context": [
|
|
15
|
+
"appeler",
|
|
16
|
+
"appel",
|
|
17
|
+
"numéro",
|
|
18
|
+
"téléphone",
|
|
19
|
+
"téléphoner"
|
|
20
|
+
]
|
|
21
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from dirty_equals import IsPartialDict
|
|
6
|
+
from pymultirole_plugins.v1.schema import Document
|
|
7
|
+
from pytest_check import check
|
|
8
|
+
|
|
9
|
+
from pyannotators_patterns.patterns import PatternsAnnotator, PatternsParameters
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_tel(
|
|
13
|
+
):
|
|
14
|
+
testdir = Path(__file__).parent
|
|
15
|
+
source = Path(testdir, "data/tel.json")
|
|
16
|
+
with source.open("r") as fin:
|
|
17
|
+
pat = json.load(fin)
|
|
18
|
+
parameters = PatternsParameters(mapping={
|
|
19
|
+
"telephone": json.dumps(pat, indent=2)
|
|
20
|
+
})
|
|
21
|
+
source = Path(testdir, "data/tel-document.json")
|
|
22
|
+
with source.open("r") as fin:
|
|
23
|
+
jdoc = json.load(fin)
|
|
24
|
+
doc = Document(**jdoc)
|
|
25
|
+
annotator = PatternsAnnotator()
|
|
26
|
+
docs: List[Document] = annotator.annotate([doc], parameters)
|
|
27
|
+
doc0 = docs[0]
|
|
28
|
+
tel = next(a.dict(exclude_none=True, exclude_unset=True) for a in doc0.annotations if
|
|
29
|
+
a.text == '+33.089-658-6494')
|
|
30
|
+
with check:
|
|
31
|
+
assert tel == IsPartialDict(labelName='telephone', text='+33.089-658-6494', score=0.85,
|
|
32
|
+
properties=IsPartialDict(prefix='33', number='089-658-6494'))
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"text": " Coordonnées degrés décimaux, référence devant :\n Exemple : N85,8598654 W150,589654\n\n\n Coordonnées degrés décimaux, référence derrière : \n Exemple 85,8598654N 150,589654W",
|
|
3
|
-
"metadata": {
|
|
4
|
-
"language": "fr"
|
|
5
|
-
},
|
|
6
|
-
"annotations": [
|
|
7
|
-
{
|
|
8
|
-
"start": 66,
|
|
9
|
-
"end": 77,
|
|
10
|
-
"labelName": "coordonnees_geographiques",
|
|
11
|
-
"text": "N85,8598654",
|
|
12
|
-
"score": 0.9,
|
|
13
|
-
"properties": {
|
|
14
|
-
"analysis_explanation": {
|
|
15
|
-
"recognizer": "PatternRecognizer",
|
|
16
|
-
"pattern_name": "ref_lat",
|
|
17
|
-
"pattern": "\\b(?P<ref_lat>N|S|n|s)[0-9,]+\\b",
|
|
18
|
-
"original_score": 0.9,
|
|
19
|
-
"score": 0.9,
|
|
20
|
-
"score_context_improvement": 0,
|
|
21
|
-
"supportive_context_word": "",
|
|
22
|
-
"regex_flags": 26
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
|
-
},
|
|
26
|
-
{
|
|
27
|
-
"start": 78,
|
|
28
|
-
"end": 89,
|
|
29
|
-
"labelName": "coordonnees_geographiques",
|
|
30
|
-
"text": "W150,589654",
|
|
31
|
-
"score": 0.9,
|
|
32
|
-
"properties": {
|
|
33
|
-
"analysis_explanation": {
|
|
34
|
-
"recognizer": "PatternRecognizer",
|
|
35
|
-
"pattern_name": "ref_long",
|
|
36
|
-
"pattern": "\\b(?P<ref_long>E|O|w|e|o)[0-9,]+\\b",
|
|
37
|
-
"original_score": 0.9,
|
|
38
|
-
"score": 0.9,
|
|
39
|
-
"score_context_improvement": 0,
|
|
40
|
-
"supportive_context_word": "",
|
|
41
|
-
"regex_flags": 26
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
},
|
|
45
|
-
{
|
|
46
|
-
"start": 160,
|
|
47
|
-
"end": 171,
|
|
48
|
-
"labelName": "coordonnees_geographiques",
|
|
49
|
-
"text": "85,8598654N",
|
|
50
|
-
"score": 0.9,
|
|
51
|
-
"properties": {
|
|
52
|
-
"analysis_explanation": {
|
|
53
|
-
"recognizer": "PatternRecognizer",
|
|
54
|
-
"pattern_name": "ref_lat_DEVANT",
|
|
55
|
-
"pattern": "\\b[0-9,]+(?P<ref_lat>N|S|n|s)\\b",
|
|
56
|
-
"original_score": 0.9,
|
|
57
|
-
"score": 0.9,
|
|
58
|
-
"score_context_improvement": 0,
|
|
59
|
-
"supportive_context_word": "",
|
|
60
|
-
"regex_flags": 26
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
},
|
|
64
|
-
{
|
|
65
|
-
"start": 172,
|
|
66
|
-
"end": 183,
|
|
67
|
-
"labelName": "coordonnees_geographiques",
|
|
68
|
-
"text": "150,589654W",
|
|
69
|
-
"score": 0.9,
|
|
70
|
-
"properties": {
|
|
71
|
-
"analysis_explanation": {
|
|
72
|
-
"recognizer": "PatternRecognizer",
|
|
73
|
-
"pattern_name": "ref_long_DEVANT",
|
|
74
|
-
"pattern": "\\b[0-9,]+(?P<ref_long>E|O|w|e|o)\\b",
|
|
75
|
-
"original_score": 0.9,
|
|
76
|
-
"score": 0.9,
|
|
77
|
-
"score_context_improvement": 0,
|
|
78
|
-
"supportive_context_word": "",
|
|
79
|
-
"regex_flags": 26
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
],
|
|
84
|
-
"identifier": "c8ecc5153c542c4f6a07fbe813384842",
|
|
85
|
-
"title": "unknown test document"
|
|
86
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|