datamarket 0.9.29__py3-none-any.whl → 0.9.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/utils/strings/__init__.py +2 -0
- datamarket/utils/{strings.py → strings/normalization.py} +3 -3
- datamarket/utils/strings/obfuscation.py +153 -0
- {datamarket-0.9.29.dist-info → datamarket-0.9.30.dist-info}/METADATA +10 -5
- {datamarket-0.9.29.dist-info → datamarket-0.9.30.dist-info}/RECORD +7 -5
- {datamarket-0.9.29.dist-info → datamarket-0.9.30.dist-info}/WHEEL +1 -1
- {datamarket-0.9.29.dist-info → datamarket-0.9.30.dist-info}/LICENSE +0 -0
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
########################################################################################################################
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
|
+
import unicodedata
|
|
4
5
|
from enum import Enum, auto
|
|
5
6
|
from typing import Any
|
|
6
|
-
import unicodedata
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
|
-
from
|
|
10
|
-
from inflection import parameterize, underscore, titleize, camelize
|
|
9
|
+
from inflection import camelize, parameterize, titleize, underscore
|
|
11
10
|
from string_utils import prettify, strip_html
|
|
11
|
+
from unidecode import unidecode
|
|
12
12
|
|
|
13
13
|
########################################################################################################################
|
|
14
14
|
# CLASSES
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import warnings
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PiiDependenciesMissingError(ImportError):
    """Raised when an optional PII anonymization dependency cannot be imported.

    Subclasses ``ImportError`` so callers that already catch ``ImportError``
    keep working.
    """
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SpacyModelNotFoundError(ImportError):
    """Raised when a required spaCy model package is not installed.

    Subclasses ``ImportError`` so callers that already catch ``ImportError``
    keep working.
    """
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
import phonenumbers
|
|
19
|
+
import spacy
|
|
20
|
+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
|
|
21
|
+
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
|
22
|
+
from presidio_analyzer.predefined_recognizers import PhoneRecognizer
|
|
23
|
+
from presidio_anonymizer import AnonymizerEngine
|
|
24
|
+
from spacy.language import Language
|
|
25
|
+
from spacy_langdetect import LanguageDetector
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
raise PiiDependenciesMissingError(
|
|
28
|
+
"One or more PII anonymization dependencies are missing. "
|
|
29
|
+
"Please install them by running: pip install datamarket[pii]\n"
|
|
30
|
+
f"Original error: {e}"
|
|
31
|
+
) from e
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# SETTINGS
|
|
36
|
+
|
|
37
|
+
# Module-scoped logger instead of the root logger: records still propagate to
# the root logger's handlers, but they carry this module's name and can be
# filtered or silenced independently by the application.
logger = logging.getLogger(__name__)

# Only let ERROR (and above) records from the "presidio-analyzer" logger through.
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)

# Ignore UserWarnings issued from spacy.pipeline.lemmatizer whose message
# starts with "[W108]".
warnings.filterwarnings(
    "ignore",
    message=r"\[W108\]",
    category=UserWarning,
    module="spacy.pipeline.lemmatizer",
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """spaCy factory registered under the name ``language_detector``.

    Both ``nlp`` and ``name`` are accepted but not used; every call returns a
    newly created ``LanguageDetector``.
    """
    detector = LanguageDetector()
    return detector
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
########################################################################################################################
|
|
54
|
+
# CLASSES
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PiiAnonymizer:
    """Anonymize personally identifiable information (PII) in Spanish or English text.

    Combines a Presidio ``AnalyzerEngine`` (configured with the
    ``es_core_news_md`` and ``en_core_web_md`` spaCy models), a Presidio
    ``AnonymizerEngine`` and a blank spaCy pipeline that is used only for
    language detection.
    """

    # Language codes the analyzer is configured for. Any other language makes
    # ``anonymize_text`` log a warning and return an empty string.
    SUPPORTED_LANG = ["es", "en"]

    def __init__(self):
        """Check the required spaCy models and build the engines.

        Raises:
            SpacyModelNotFoundError: If at least one required spaCy model is
                not installed. The message contains one line per missing
                model, including the command that installs it.
        """
        # Model package name -> command that installs it.
        required_models = {
            "en_core_web_md": "python -m spacy download en_core_web_md",
            "es_core_news_md": "python -m spacy download es_core_news_md",
        }
        missing_models_instructions = [
            f"Model '{model_name}' not found. Please install it by running: {install_command}"
            for model_name, install_command in required_models.items()
            if not spacy.util.is_package(model_name)
        ]
        if missing_models_instructions:
            # Report every missing model at once rather than failing on the first.
            raise SpacyModelNotFoundError("\n".join(missing_models_instructions))

        self.anonymizer = AnonymizerEngine()
        self.analyzer = self._load_analyzer_engine()

        # Must come after the analyzer: the detection pipeline reuses the
        # vocab of the analyzer's English model.
        self.nlp = self._nlp()

    def _nlp(self) -> "Language":
        """Build the blank English pipeline used for language detection.

        The pipeline shares its vocab with the analyzer's English model and
        receives the ``sentencizer`` and ``language_detector`` components when
        the corresponding factories are available.

        Returns:
            The configured spaCy pipeline.
        """
        analyzer_en_model = self.analyzer.nlp_engine.nlp.get("en")
        shared_vocab = analyzer_en_model.vocab
        nlp = spacy.blank("en", vocab=shared_vocab)

        if nlp.has_factory("sentencizer"):
            nlp.add_pipe("sentencizer")

        if nlp.has_factory("language_detector"):
            nlp.add_pipe("language_detector", last=True)

        return nlp

    @staticmethod
    def _nlp_config():
        """Return the configuration dict passed to ``NlpEngineProvider``.

        Returns:
            A dict selecting the ``spacy`` engine with one model per supported
            language (``es`` and ``en``).
        """
        return {
            "nlp_engine_name": "spacy",
            "models": [
                {"lang_code": "es", "model_name": "es_core_news_md"},
                {"lang_code": "en", "model_name": "en_core_web_md"},
            ],
        }

    def _load_analyzer_engine(self) -> "AnalyzerEngine":
        """Create the Presidio analyzer for the supported languages.

        Loads the predefined recognizers for ``SUPPORTED_LANG`` and then
        registers an extra ``PhoneRecognizer`` for Spanish with Spanish
        context words.

        Returns:
            The configured ``AnalyzerEngine``.
        """
        provider = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config())
        nlp_engine = provider.create_engine()

        # Spanish phone recognizer covering every region known to phonenumbers.
        phone_recognizer_es = PhoneRecognizer(
            supported_language="es",
            supported_regions=phonenumbers.SUPPORTED_REGIONS,
            context=["teléfono", "móvil", "número"],
        )

        registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
        registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
        registry.add_recognizer(phone_recognizer_es)

        return AnalyzerEngine(
            registry=registry,
            nlp_engine=nlp_engine,
            supported_languages=self.SUPPORTED_LANG,
        )

    def detect_lang(self, text: str) -> str:
        """Detect the language of ``text`` with the language-detection pipeline.

        Args:
            text: Text whose language should be detected.

        Returns:
            The ``"language"`` entry of the document's ``language`` extension,
            or ``"unknown"`` when the detection pipeline is not available
            (an error is logged in that case).
        """
        nlp = getattr(self, "nlp", None)
        if not nlp:
            logger.error("Language detection NLP model not initialized. Cannot detect language.")
            return "unknown"

        with nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
            doc = nlp(text)
        return doc._.language["language"]

    def anonymize_text(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        lang: str = "unknown",
    ) -> str:
        """Return ``text`` with the detected PII entities anonymized.

        Args:
            text: Text to anonymize.
            entities: Entity names forwarded to the analyzer; ``None`` is
                forwarded unchanged.
            lang: Language code of ``text``. With the default ``"unknown"``
                the language is detected via ``detect_lang``.

        Returns:
            The anonymized text, or an empty string when the (given or
            detected) language is not in ``SUPPORTED_LANG``.
        """
        if lang == "unknown":
            lang = self.detect_lang(text)

        # Single guard for both the explicit and the detected language.
        # Fail safe: never hand back text that could not be analyzed.
        if lang not in self.SUPPORTED_LANG:
            logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
            return ""

        analyzer_result = self.analyzer.analyze(
            text=text,
            entities=entities,
            language=lang,
        )
        anonymizer_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_result)
        return anonymizer_result.text
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.29
|
|
3
|
+
Version: 0.9.30
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
+
Home-page: https://datamarket.es
|
|
5
6
|
License: GPL-3.0-or-later
|
|
6
7
|
Author: DataMarket
|
|
7
8
|
Author-email: techsupport@datamarket.es
|
|
8
|
-
Requires-Python: >=3.12,<
|
|
9
|
+
Requires-Python: >=3.12,<3.13
|
|
9
10
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
11
|
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
12
|
Classifier: Operating System :: OS Independent
|
|
12
13
|
Classifier: Programming Language :: Python :: 3
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
15
15
|
Provides-Extra: alchemy
|
|
16
16
|
Provides-Extra: aws
|
|
17
17
|
Provides-Extra: azure-storage-blob
|
|
@@ -40,6 +40,7 @@ Provides-Extra: openpyxl
|
|
|
40
40
|
Provides-Extra: pandas
|
|
41
41
|
Provides-Extra: pandera
|
|
42
42
|
Provides-Extra: peerdb
|
|
43
|
+
Provides-Extra: pii
|
|
43
44
|
Provides-Extra: pillow
|
|
44
45
|
Provides-Extra: playwright
|
|
45
46
|
Provides-Extra: playwright-stealth
|
|
@@ -96,6 +97,8 @@ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
|
|
|
96
97
|
Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
|
|
97
98
|
Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
|
|
98
99
|
Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
|
|
100
|
+
Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
|
|
101
|
+
Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
|
|
99
102
|
Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
|
|
100
103
|
Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
101
104
|
Requires-Dist: pycountry (>=24.0.0,<25.0.0)
|
|
@@ -111,14 +114,16 @@ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
|
111
114
|
Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
|
|
112
115
|
Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
|
|
113
116
|
Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
|
|
117
|
+
Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
|
|
118
|
+
Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
|
|
114
119
|
Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
|
|
115
120
|
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
121
|
+
Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
|
|
116
122
|
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
117
123
|
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
118
124
|
Requires-Dist: unidecode (>=1.0.0,<2.0.0)
|
|
119
125
|
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
120
126
|
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
121
|
-
Project-URL: Homepage, https://datamarket.es
|
|
122
127
|
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
123
128
|
Description-Content-Type: text/markdown
|
|
124
129
|
|
|
@@ -16,10 +16,12 @@ datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,6
|
|
|
16
16
|
datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
|
-
datamarket/utils/strings.py,sha256=
|
|
19
|
+
datamarket/utils/strings/__init__.py,sha256=RmyN3hKGXmUym8w5tn28yWkw2uM-b5OvntB4D0lU1eo,84
|
|
20
|
+
datamarket/utils/strings/normalization.py,sha256=337M2UPwEETvhVTOnP4w_igTXpHUHoaD8e7x_-L-Bpk,5654
|
|
21
|
+
datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
|
|
20
22
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
21
23
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
22
|
-
datamarket-0.9.
|
|
23
|
-
datamarket-0.9.
|
|
24
|
-
datamarket-0.9.
|
|
25
|
-
datamarket-0.9.
|
|
24
|
+
datamarket-0.9.30.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
25
|
+
datamarket-0.9.30.dist-info/METADATA,sha256=zzhHMrHhBf_CfBLwjj4melul8sCkcO8np-nmay0jKOQ,6871
|
|
26
|
+
datamarket-0.9.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
27
|
+
datamarket-0.9.30.dist-info/RECORD,,
|
|
File without changes
|