pyprocessors-iptc_mapper 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ """Sherpa IPTC category mapper"""
2
+ __version__ = "0.5.7"
@@ -0,0 +1,85 @@
1
+ from collections import namedtuple
2
+ from functools import lru_cache
3
+ from itertools import groupby
4
+ from pathlib import Path
5
+ from typing import Type, cast, List, Dict
6
+
7
+ import pandas as pd
8
+ from log_with_context import add_logging_context, Logger
9
+ from pydantic import BaseModel, Field
10
+ from pymultirole_plugins.v1.processor import ProcessorParameters, ProcessorBase
11
+ from pymultirole_plugins.v1.schema import Document, Category
12
+
13
+ logger = Logger("pymultirole")
14
+
15
+
16
class IPTCMapperParameters(ProcessorParameters):
    """Parameters accepted by the IPTC mapper processor."""

    # Keys are category label names; values are IPTC media topic codes as
    # used to index the topic table (i.e. without the "medtop:" prefix).
    label2iptc_mapping: Dict[str, str] = Field(
        None,
        description="Label to iptc mediatopic mapping",
        extra="key:label",
    )
19
+
20
+
21
def is_iptc_category(cat: "Category") -> bool:
    """Tell whether a category's ``labelName`` looks like an IPTC topic path.

    A label qualifies when it is a non-empty string made up only of ASCII
    decimal digits and underscores (e.g. ``"01000000_20000002"``).

    Args:
        cat: Object exposing a ``labelName`` attribute.

    Returns:
        ``True`` if the label is a non-empty digits/underscore string,
        ``False`` otherwise (including a missing or empty label).
    """
    label = cat.labelName
    # ``all()`` is vacuously true on an empty string and iterating ``None``
    # raises, so both are rejected explicitly instead of being treated as
    # IPTC categories (or crashing).
    if not label or not isinstance(label, str):
        return False
    return all(ch in "0123456789_" for ch in label)
24
+
25
+
26
class IPTCMapper(ProcessorBase):
    """Map document categories onto IPTC media topic categories."""

    def process(
        self, documents: List[Document], parameters: ProcessorParameters
    ) -> List[Document]:  # noqa: C901
        """Rewrite each document's categories as IPTC categories.

        For every document the resulting category list contains:

        * the categories whose ``labelName`` already looks like an IPTC path
          (see ``is_iptc_category``), kept as they are;
        * one new category per other category whose ``labelName`` is a key
          of ``label2iptc_mapping``, labelled with the media topic path and
          carrying the score of the source category. It is only added when
          no existing IPTC category already has that path.

        Categories that are neither IPTC-like nor mapped are dropped.

        Args:
            documents: Documents to process; they are modified in place.
            parameters: Expected to be an ``IPTCMapperParameters`` instance.

        Returns:
            The same ``documents`` list, with ``categories`` replaced.
        """
        params: IPTCMapperParameters = cast(IPTCMapperParameters, parameters)
        topics = get_mediatopics()

        # Resolve the configured codes once. A code missing from the topic
        # table used to be stored as ``None`` and crash later on
        # ``mediatopic.path``; such entries are now reported and ignored.
        mapping = {}
        for label, code in (params.label2iptc_mapping or {}).items():
            mediatopic = topics.get(code)
            if mediatopic is None:
                logger.warning(
                    "Unknown IPTC mediatopic code %r for label %r: mapping ignored",
                    code,
                    label,
                )
                continue
            mapping[label] = mediatopic

        for document in documents:
            with add_logging_context(docid=document.identifier):
                iptc_categories = {}
                mapped_categories = {}
                sorted_categories = sorted(
                    document.categories or [],
                    key=lambda c: c.labelName,
                    reverse=True,
                )
                # groupby only merges consecutive items, so a given flag may
                # show up in several groups; both branches accumulate into
                # dicts, which makes that harmless.
                for is_iptc, group in groupby(sorted_categories, is_iptc_category):
                    if is_iptc:
                        iptc_categories.update({c.labelName: c for c in group})
                        continue
                    for c in group:
                        mediatopic = mapping.get(c.labelName)
                        if mediatopic is None:
                            continue
                        clabel = f"{mediatopic.path.replace('_', '/')} ({mediatopic.label})"
                        mapped_categories[mediatopic.path] = Category(
                            labelName=mediatopic.path, label=clabel, score=c.score
                        )
                categories = list(iptc_categories.values())
                # Categories already present on the document take precedence
                # over mapped ones with the same path.
                categories.extend(
                    mapped_category
                    for path, mapped_category in mapped_categories.items()
                    if path not in iptc_categories
                )
                document.categories = categories
        return documents

    @classmethod
    def get_model(cls) -> Type[BaseModel]:
        """Return the parameters model class of this processor."""
        return IPTCMapperParameters
60
+
61
+
62
# One IPTC media topic: its English label, its code and its underscore-joined
# path of codes from the first level down to the topic itself.
Mediatopic = namedtuple("Mediatopic", ["label", "code", "path"])


@lru_cache(maxsize=None)
def get_mediatopics():
    """Load the IPTC media topics from the spreadsheet shipped with the package.

    The file ``IPTC-MediaTopic-NewsCodes.xlsx`` located next to this module
    is read with the second spreadsheet row as header. Each data row
    provides the topic code in ``NewsCode-QCode (flat)`` and the same code
    in exactly one of the ``Level1/NewsCode`` .. ``Level6/NewsCode``
    columns, which gives the depth of the topic. Rows are assumed to be
    listed parents first, so the most recent code seen at each shallower
    level is the ancestor of the current row.

    Rows without a code, without any populated level column, or appearing
    before their ancestors are skipped instead of producing a bogus path or
    raising.

    Returns:
        A dict mapping the topic code (without the ``medtop:`` prefix) to
        its ``Mediatopic``. The result is cached, so the same dict object
        is returned on every call and must not be mutated by callers.
    """
    prefix = "medtop:"
    max_levels = 6

    def strip_prefix(qcode):
        # Only drop the prefix when it is really there, rather than blindly
        # cutting the first characters of the cell.
        qcode = str(qcode)
        return qcode[len(prefix):] if qcode.startswith(prefix) else qcode

    iptc = Path(__file__).parent / "IPTC-MediaTopic-NewsCodes.xlsx"
    iptc_codes = pd.read_excel(iptc, header=1).fillna(value="")

    topics = {}
    # levels[i] holds the code of the last topic seen at depth i.
    levels = [None] * max_levels
    for _, row in iptc_codes.iterrows():
        topic_code = strip_prefix(row["NewsCode-QCode (flat)"])
        if not topic_code:
            continue  # blank or non-topic row

        # Depth of the row: first level column holding a value.
        depth = next(
            (lev for lev in range(max_levels) if row[f"Level{lev + 1}/NewsCode"]),
            None,
        )
        if depth is None:
            continue  # no level information: a path cannot be built

        levels[depth] = strip_prefix(row[f"Level{depth + 1}/NewsCode"])
        ancestors = levels[:depth + 1]
        if None in ancestors:
            continue  # child listed before its parents

        topics[topic_code] = Mediatopic(
            label=row["Name (en-GB)"], code=topic_code, path="_".join(ancestors)
        )
    return topics
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyprocessors-iptc_mapper
3
+ Version: 0.5.7
4
+ Summary: Sherpa IPTC category mapper
5
+ Home-page: https://github.com/oterrier/pyprocessors_iptc_mapper/
6
+ Keywords:
7
+ Author: Olivier Terrier
8
+ Author-email: olivier.terrier@kairntech.com
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Classifier: Intended Audience :: Information Technology
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: System Administrators
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Classifier: Topic :: Software Development
19
+ Classifier: License :: OSI Approved :: MIT License
20
+ Classifier: Development Status :: 4 - Beta
21
+ Classifier: Programming Language :: Python :: 3.8
22
+ License-File: LICENSE
23
+ Requires-Dist: pymultirole-plugins>=0.5.152,<0.6.0
24
+ Requires-Dist: collections-extended
25
+ Requires-Dist: pandas>=1.2.3,<=1.3.5
26
+ Requires-Dist: openpyxl==3.0.7
27
+ Requires-Dist: log-with-context
28
+ Requires-Dist: flit ; extra == "dev"
29
+ Requires-Dist: pre-commit ; extra == "dev"
30
+ Requires-Dist: bump2version ; extra == "dev"
31
+ Requires-Dist: sphinx ; extra == "docs"
32
+ Requires-Dist: sphinx-rtd-theme ; extra == "docs"
33
+ Requires-Dist: m2r2 ; extra == "docs"
34
+ Requires-Dist: sphinxcontrib.apidoc ; extra == "docs"
35
+ Requires-Dist: jupyter_sphinx ; extra == "docs"
36
+ Requires-Dist: pytest ; extra == "test"
37
+ Requires-Dist: pytest-cov ; extra == "test"
38
+ Requires-Dist: pytest-flake8 ; extra == "test"
39
+ Requires-Dist: pytest-black ; extra == "test"
40
+ Requires-Dist: flake8==3.9.2 ; extra == "test"
41
+ Requires-Dist: tox ; extra == "test"
42
+ Requires-Dist: dirty-equals ; extra == "test"
43
+ Provides-Extra: dev
44
+ Provides-Extra: docs
45
+ Provides-Extra: test
46
+
47
+ # pyprocessors_iptc_mapper
48
+
49
+ [![license](https://img.shields.io/github/license/oterrier/pyprocessors_iptc_mapper)](https://github.com/oterrier/pyprocessors_iptc_mapper/blob/master/LICENSE)
50
+ [![tests](https://github.com/oterrier/pyprocessors_iptc_mapper/workflows/tests/badge.svg)](https://github.com/oterrier/pyprocessors_iptc_mapper/actions?query=workflow%3Atests)
51
+ [![codecov](https://img.shields.io/codecov/c/github/oterrier/pyprocessors_iptc_mapper)](https://codecov.io/gh/oterrier/pyprocessors_iptc_mapper)
52
+ [![docs](https://img.shields.io/readthedocs/pyprocessors_iptc_mapper)](https://pyprocessors_iptc_mapper.readthedocs.io)
53
+ [![version](https://img.shields.io/pypi/v/pyprocessors_iptc_mapper)](https://pypi.org/project/pyprocessors_iptc_mapper/)
54
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pyprocessors_iptc_mapper)](https://pypi.org/project/pyprocessors_iptc_mapper/)
55
+
56
+ Sherpa processor that maps document categories to IPTC Media Topic categories.
57
+
58
+ ## Installation
59
+
60
+ You can simply `pip install pyprocessors_iptc_mapper`.
61
+
62
+ ## Developing
63
+
64
+ ### Prerequisites
65
+
66
+ You will need to install `flit` (for building the package) and `tox` (for orchestrating testing and documentation building):
67
+
68
+ ```
69
+ python3 -m pip install flit tox
70
+ ```
71
+
72
+ Clone the repository:
73
+
74
+ ```
75
+ git clone https://github.com/oterrier/pyprocessors_iptc_mapper
76
+ ```
77
+
78
+ ### Running the test suite
79
+
80
+ You can run the full test suite against all supported versions of Python (3.8) with:
81
+
82
+ ```
83
+ tox
84
+ ```
85
+
86
+ ### Building the documentation
87
+
88
+ You can build the HTML documentation with:
89
+
90
+ ```
91
+ tox -e docs
92
+ ```
93
+
94
+ The built documentation is available at `docs/_build/index.html`.
95
+
@@ -0,0 +1,8 @@
1
+ pyprocessors_iptc_mapper/IPTC-MediaTopic-NewsCodes.xlsx,sha256=yM-qPrJHyqEg9zelQsQJfvMpJjvHJt_isR8YdoRHw_4,703544
2
+ pyprocessors_iptc_mapper/__init__.py,sha256=A-4L20hcpPg7KfTApbYFFhXUV4Lc47dsQkUFCz_2RoU,56
3
+ pyprocessors_iptc_mapper/iptc_mapper.py,sha256=5wVZ1XqpIDer6o5w-piW1G_ECt9Mxd4xeOI72NmcLXI,3469
4
+ pyprocessors_iptc_mapper-0.5.7.dist-info/entry_points.txt,sha256=MnrwTzmr_PKkxRlg6MaCfM5oIVFXUa1P-2F0JWxWh4A,84
5
+ pyprocessors_iptc_mapper-0.5.7.dist-info/licenses/LICENSE,sha256=QPypQEap7i1K1jF4X_JOx3MvFyVupJe4-rr6h2DuXME,1082
6
+ pyprocessors_iptc_mapper-0.5.7.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
7
+ pyprocessors_iptc_mapper-0.5.7.dist-info/METADATA,sha256=fGs7459R3Zm8iGnT4uvmBpt811ZrVn26MXTuor_ipy0,3470
8
+ pyprocessors_iptc_mapper-0.5.7.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: flit 3.12.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [pyprocessors.plugins]
2
+ iptc_mapper=pyprocessors_iptc_mapper.iptc_mapper:IPTCMapper
3
+
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Olivier Terrier
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.