datamarket 0.9.29__py3-none-any.whl → 0.9.30__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -0,0 +1,2 @@
1
+ from .normalization import * # noqa: F403
2
+ from .obfuscation import * # noqa: F403
@@ -1,14 +1,14 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
3
 
4
+ import unicodedata
4
5
  from enum import Enum, auto
5
6
  from typing import Any
6
- import unicodedata
7
7
 
8
8
  import numpy as np
9
- from unidecode import unidecode
10
- from inflection import parameterize, underscore, titleize, camelize
9
+ from inflection import camelize, parameterize, titleize, underscore
11
10
  from string_utils import prettify, strip_html
11
+ from unidecode import unidecode
12
12
 
13
13
  ########################################################################################################################
14
14
  # CLASSES
@@ -0,0 +1,153 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import logging
5
+ import warnings
6
+ from typing import List, Optional
7
+
8
+
9
class PiiDependenciesMissingError(ImportError):
    """Raised when the optional PII-anonymization dependencies are not installed."""
11
+
12
+
13
class SpacyModelNotFoundError(ImportError):
    """Raised when a required spaCy language model is not installed."""
15
+
16
+
17
+ try:
18
+ import phonenumbers
19
+ import spacy
20
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
21
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
22
+ from presidio_analyzer.predefined_recognizers import PhoneRecognizer
23
+ from presidio_anonymizer import AnonymizerEngine
24
+ from spacy.language import Language
25
+ from spacy_langdetect import LanguageDetector
26
+ except ImportError as e:
27
+ raise PiiDependenciesMissingError(
28
+ "One or more PII anonymization dependencies are missing. "
29
+ "Please install them by running: pip install datamarket[pii]\n"
30
+ f"Original error: {e}"
31
+ ) from e
32
+
33
+
34
+ ########################################################################################################################
35
+ # SETTINGS
36
+
37
# Module-wide logger (root logger, matching the rest of the package).
logger = logging.getLogger()

# presidio-analyzer logs verbose recognizer chatter by default; keep errors only.
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)

# spaCy's lemmatizer emits [W108] UserWarnings that are irrelevant here.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="spacy.pipeline.lemmatizer",
    message=r"\[W108\]",
)
46
+
47
+
48
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """spaCy factory hook returning a ``LanguageDetector`` pipeline component.

    Registered under the name "language_detector" so the component can be
    attached with ``nlp.add_pipe("language_detector")``; the ``nlp`` and
    ``name`` arguments are required by the factory protocol and unused here.
    """
    return LanguageDetector()
51
+
52
+
53
+ ########################################################################################################################
54
+ # CLASSES
55
+
56
+
57
class PiiAnonymizer:
    """Detect and anonymize PII in Spanish and English text.

    Wraps a presidio ``AnalyzerEngine``/``AnonymizerEngine`` pair backed by
    spaCy models, plus a lightweight language-detection pipeline.

    Raises:
        SpacyModelNotFoundError: at construction time, when a required spaCy
            model is not installed (the message includes the install command).
    """

    # Languages the presidio engines are configured for (see _nlp_config).
    SUPPORTED_LANG = ["es", "en"]

    def __init__(self):
        # Fail fast with actionable install instructions if models are absent.
        required_models = {
            "en_core_web_md": "python -m spacy download en_core_web_md",
            "es_core_news_md": "python -m spacy download es_core_news_md",
        }
        missing_models_instructions = [
            f"Model '{model_name}' not found. Please install it by running: {install_command}"
            for model_name, install_command in required_models.items()
            if not spacy.util.is_package(model_name)
        ]
        if missing_models_instructions:
            raise SpacyModelNotFoundError("\n".join(missing_models_instructions))

        self.anonymizer = AnonymizerEngine()
        self.analyzer = self._load_analyzer_engine()
        self.nlp = self._nlp()

    def _nlp(self) -> Language:
        """Build a minimal pipeline used only for language detection.

        Reuses the analyzer's English model vocab so no second model is
        loaded into memory.
        """
        analyzer_en_model = self.analyzer.nlp_engine.nlp.get("en")
        shared_vocab = analyzer_en_model.vocab
        nlp = spacy.blank("en", vocab=shared_vocab)

        # language_detector needs sentence boundaries, so add a sentencizer first.
        if nlp.has_factory("sentencizer"):
            nlp.add_pipe("sentencizer")

        if nlp.has_factory("language_detector"):
            nlp.add_pipe("language_detector", last=True)

        return nlp

    @staticmethod
    def _nlp_config():
        """presidio NLP-engine configuration: one spaCy model per language."""
        return {
            "nlp_engine_name": "spacy",
            "models": [
                {"lang_code": "es", "model_name": "es_core_news_md"},
                {"lang_code": "en", "model_name": "en_core_web_md"},
            ],
        }

    def _load_analyzer_engine(self) -> AnalyzerEngine:
        """Create the AnalyzerEngine with an additional Spanish phone recognizer."""
        provider = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config())
        nlp_engine = provider.create_engine()
        # Register a Spanish-language phone recognizer covering every region
        # known to phonenumbers, with Spanish context words to boost scoring.
        phone_recognizer_es = PhoneRecognizer(
            supported_language="es",
            supported_regions=phonenumbers.SUPPORTED_REGIONS,
            context=["teléfono", "móvil", "número"],
        )
        registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
        registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
        registry.add_recognizer(phone_recognizer_es)

        analyzer = AnalyzerEngine(
            registry=registry,
            nlp_engine=nlp_engine,
            supported_languages=self.SUPPORTED_LANG,
        )
        return analyzer

    def detect_lang(self, text: str) -> str:
        """Return the detected ISO language code for *text*, or "unknown"."""
        if hasattr(self, "nlp") and self.nlp:
            # NOTE(review): spaCy's tokenizer is not a pipeline component, so
            # "tokenizer" in enable= is presumably a no-op — confirm intended.
            with self.nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
                doc = self.nlp(text)
                return doc._.language["language"]
        else:
            logger.error("Language detection NLP model not initialized. Cannot detect language.")
            return "unknown"

    def anonymize_text(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        lang: str = "unknown",
    ) -> str:
        """Anonymize PII entities found in *text*.

        Args:
            text: input text to scrub.
            entities: optional presidio entity names to restrict analysis to;
                None analyzes all entities.
            lang: ISO code ("es"/"en"); auto-detected when left as "unknown".

        Returns:
            The anonymized text, or "" when the language is unsupported
            (fail-safe: never return un-scrubbed text).
        """
        if lang == "unknown":
            lang = self.detect_lang(text)
        # Single unsupported-language guard — the original duplicated this
        # check in an if/elif pair with identical bodies.
        if lang not in self.SUPPORTED_LANG:
            logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
            return ""

        analyzer_result = self.analyzer.analyze(
            text=text,
            entities=entities,
            language=lang,
        )
        anonymizer_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_result)
        return anonymizer_result.text
@@ -1,17 +1,17 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.1
2
2
  Name: datamarket
3
- Version: 0.9.29
3
+ Version: 0.9.30
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
+ Home-page: https://datamarket.es
5
6
  License: GPL-3.0-or-later
6
7
  Author: DataMarket
7
8
  Author-email: techsupport@datamarket.es
8
- Requires-Python: >=3.12,<4.0
9
+ Requires-Python: >=3.12,<3.13
9
10
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
11
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
12
  Classifier: Operating System :: OS Independent
12
13
  Classifier: Programming Language :: Python :: 3
13
14
  Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
15
  Provides-Extra: alchemy
16
16
  Provides-Extra: aws
17
17
  Provides-Extra: azure-storage-blob
@@ -40,6 +40,7 @@ Provides-Extra: openpyxl
40
40
  Provides-Extra: pandas
41
41
  Provides-Extra: pandera
42
42
  Provides-Extra: peerdb
43
+ Provides-Extra: pii
43
44
  Provides-Extra: pillow
44
45
  Provides-Extra: playwright
45
46
  Provides-Extra: playwright-stealth
@@ -96,6 +97,8 @@ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
96
97
  Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
97
98
  Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
98
99
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
100
+ Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
101
+ Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
99
102
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
100
103
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
101
104
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
@@ -111,14 +114,16 @@ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
111
114
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
112
115
  Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
113
116
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
117
+ Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
118
+ Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
114
119
  Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
115
120
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
121
+ Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
116
122
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
117
123
  Requires-Dist: typer (>=0.15.0,<0.16.0)
118
124
  Requires-Dist: unidecode (>=1.0.0,<2.0.0)
119
125
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
120
126
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
121
- Project-URL: Homepage, https://datamarket.es
122
127
  Project-URL: Repository, https://github.com/Data-Market/datamarket
123
128
  Description-Content-Type: text/markdown
124
129
 
@@ -16,10 +16,12 @@ datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,6
16
16
  datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
- datamarket/utils/strings.py,sha256=rEX9NeBG4C7RECgT0EQebgoFoxgZMy9-7EcBSxgBANU,5654
19
+ datamarket/utils/strings/__init__.py,sha256=RmyN3hKGXmUym8w5tn28yWkw2uM-b5OvntB4D0lU1eo,84
20
+ datamarket/utils/strings/normalization.py,sha256=337M2UPwEETvhVTOnP4w_igTXpHUHoaD8e7x_-L-Bpk,5654
21
+ datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
20
22
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
21
23
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
22
- datamarket-0.9.29.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
- datamarket-0.9.29.dist-info/METADATA,sha256=h4DuPT0ToLAN6vSLidYyriB9gtKjjaDPcf2MWH5fm44,6588
24
- datamarket-0.9.29.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
- datamarket-0.9.29.dist-info/RECORD,,
24
+ datamarket-0.9.30.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
25
+ datamarket-0.9.30.dist-info/METADATA,sha256=zzhHMrHhBf_CfBLwjj4melul8sCkcO8np-nmay0jKOQ,6871
26
+ datamarket-0.9.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
27
+ datamarket-0.9.30.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.3
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any