pyprocessors-glotlid 1.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ results.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
91
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
92
+ # install all needed dependencies.
93
+ #Pipfile.lock
94
+
95
+ # celery beat schedule file
96
+ celerybeat-schedule
97
+
98
+ # SageMath parsed files
99
+ *.sage.py
100
+
101
+ # Environments
102
+ .env
103
+ .venv
104
+ env/
105
+ venv/
106
+ ENV/
107
+ env.bak/
108
+ venv.bak/
109
+
110
+ # Spyder project settings
111
+ .spyderproject
112
+ .spyproject
113
+
114
+ # Rope project settings
115
+ .ropeproject
116
+
117
+ # mkdocs documentation
118
+ /site
119
+
120
+ # mypy
121
+ .mypy_cache/
122
+ .dmypy.json
123
+ dmypy.json
124
+
125
+ # Pyre type checker
126
+ .pyre/
127
+
128
+ # Specific
129
+ .idea/
130
+ .groovylintrc.json
131
+ .emailNotif
132
+ uv.lock
133
+
134
+ # SBOMs
135
+ **/sbom*.json
136
+ **/trivy*.html
137
+ **/audit*.json
@@ -0,0 +1,5 @@
1
+ # Authors
2
+
3
+ Contributors to pyprocessors_glotlid include:
4
+
5
+ + [Olivier Terrier](mailto:olivier.terrier@kairntech.com)
@@ -0,0 +1,7 @@
1
+ # Changelog
2
+ All notable changes to this project will be documented in this file.
3
+
4
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [unreleased]
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Olivier Terrier
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyprocessors-glotlid
3
+ Version: 1.6.3
4
+ Summary: Sherpa Consolidation processor
5
+ Project-URL: Homepage, https://github.com/oterrier/pyprocessors_glotlid/
6
+ Author-email: Olivier Terrier <olivier.terrier@kairntech.com>
7
+ License: The MIT License (MIT)
8
+
9
+ Copyright (c) 2021 Olivier Terrier
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in
19
+ all copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27
+ THE SOFTWARE.
28
+ License-File: AUTHORS.md
29
+ License-File: LICENSE
30
+ Classifier: Development Status :: 4 - Beta
31
+ Classifier: Intended Audience :: Developers
32
+ Classifier: Intended Audience :: Information Technology
33
+ Classifier: Intended Audience :: System Administrators
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Operating System :: OS Independent
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: Topic :: Software Development
38
+ Classifier: Topic :: Software Development :: Libraries
39
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
40
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
+ Requires-Python: >=3.12
42
+ Requires-Dist: fasttext-numpy2-wheel
43
+ Requires-Dist: huggingface-hub
44
+ Requires-Dist: iso639-lang
45
+ Requires-Dist: log-with-context
46
+ Requires-Dist: pymultirole-plugins<1.7.0,>=1.6.0
47
+ Provides-Extra: dev
48
+ Requires-Dist: bump2version; extra == 'dev'
49
+ Requires-Dist: pre-commit; extra == 'dev'
50
+ Provides-Extra: docs
51
+ Requires-Dist: lxml-html-clean; extra == 'docs'
52
+ Requires-Dist: m2r2; extra == 'docs'
53
+ Requires-Dist: sphinx; extra == 'docs'
54
+ Requires-Dist: sphinx-rtd-theme; extra == 'docs'
55
+ Requires-Dist: sphinxcontrib-apidoc; extra == 'docs'
56
+ Provides-Extra: sbom
57
+ Requires-Dist: cyclonedx-bom; extra == 'sbom'
58
+ Requires-Dist: pip-audit; extra == 'sbom'
59
+ Provides-Extra: test
60
+ Requires-Dist: dirty-equals; extra == 'test'
61
+ Requires-Dist: pip; extra == 'test'
62
+ Requires-Dist: pytest; extra == 'test'
63
+ Requires-Dist: pytest-cov; extra == 'test'
64
+ Requires-Dist: ruff; extra == 'test'
65
+ Description-Content-Type: text/markdown
66
+
67
+ # pyprocessors_glotlid
68
+
69
+ [![license](https://img.shields.io/github/license/oterrier/pyprocessors_glotlid)](https://github.com/oterrier/pyprocessors_glotlid/blob/master/LICENSE)
70
+ [![tests](https://github.com/oterrier/pyprocessors_glotlid/workflows/tests/badge.svg)](https://github.com/oterrier/pyprocessors_glotlid/actions?query=workflow%3Atests)
71
+ [![codecov](https://img.shields.io/codecov/c/github/oterrier/pyprocessors_glotlid)](https://codecov.io/gh/oterrier/pyprocessors_glotlid)
72
+ [![docs](https://img.shields.io/readthedocs/pyprocessors_glotlid)](https://pyprocessors_glotlid.readthedocs.io)
73
+ [![version](https://img.shields.io/pypi/v/pyprocessors_glotlid)](https://pypi.org/project/pyprocessors_glotlid/)
74
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pyprocessors_glotlid)](https://pypi.org/project/pyprocessors_glotlid/)
75
+
76
+ Language identification processor based on GlotLID: it detects the language of each document and stores the result in the document metadata.
77
+
78
+ ## Installation
79
+
80
+ You can simply `pip install pyprocessors_glotlid`.
81
+
82
+ ## Developing
83
+
84
+ ### Pre-requisites
85
+
86
+ You will need to install `uv` (for managing dependencies and running tests):
87
+
88
+ ```
89
+ pip install uv
90
+ ```
91
+
92
+ Clone the repository:
93
+
94
+ ```
95
+ git clone https://github.com/oterrier/pyprocessors_glotlid
96
+ ```
97
+
98
+ ### Running the test suite
99
+
100
+ Install dependencies and run the full test suite:
101
+
102
+ ```
103
+ uv sync --extra test
104
+ uv run pytest
105
+ ```
106
+
107
+ ### Linting
108
+
109
+ ```
110
+ uv run ruff check .
111
+ uv run ruff format --check .
112
+ ```
113
+
114
+ ### Building the documentation
115
+
116
+ ```
117
+ uv run --extra docs sphinx-build docs docs/_build
118
+ ```
119
+
120
+ The built documentation is available at `docs/_build/index.html`.
121
+
122
+ ## SBOM & vulnerability check
123
+
124
+ Install the SBOM dependencies:
125
+
126
+ ```
127
+ uv sync --extra sbom
128
+ ```
129
+
130
+ Generate a CycloneDX SBOM from the current environment:
131
+
132
+ ```
133
+ uv run cyclonedx-py environment -o sbom.cdx.json --output-format json
134
+ ```
135
+
136
+ Audit dependencies for known vulnerabilities:
137
+
138
+ ```
139
+ uv run pip-audit --format json --output audit-report.json
140
+ ```
141
+
142
+ To fail on any known vulnerability (useful in CI):
143
+
144
+ ```
145
+ uv run pip-audit --strict
146
+ ```
@@ -0,0 +1,80 @@
1
+ # pyprocessors_glotlid
2
+
3
+ [![license](https://img.shields.io/github/license/oterrier/pyprocessors_glotlid)](https://github.com/oterrier/pyprocessors_glotlid/blob/master/LICENSE)
4
+ [![tests](https://github.com/oterrier/pyprocessors_glotlid/workflows/tests/badge.svg)](https://github.com/oterrier/pyprocessors_glotlid/actions?query=workflow%3Atests)
5
+ [![codecov](https://img.shields.io/codecov/c/github/oterrier/pyprocessors_glotlid)](https://codecov.io/gh/oterrier/pyprocessors_glotlid)
6
+ [![docs](https://img.shields.io/readthedocs/pyprocessors_glotlid)](https://pyprocessors_glotlid.readthedocs.io)
7
+ [![version](https://img.shields.io/pypi/v/pyprocessors_glotlid)](https://pypi.org/project/pyprocessors_glotlid/)
8
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pyprocessors_glotlid)](https://pypi.org/project/pyprocessors_glotlid/)
9
+
10
+ Language identification processor based on GlotLID: it detects the language of each document and stores the result in the document metadata.
11
+
12
+ ## Installation
13
+
14
+ You can simply `pip install pyprocessors_glotlid`.
15
+
16
+ ## Developing
17
+
18
+ ### Pre-requisites
19
+
20
+ You will need to install `uv` (for managing dependencies and running tests):
21
+
22
+ ```
23
+ pip install uv
24
+ ```
25
+
26
+ Clone the repository:
27
+
28
+ ```
29
+ git clone https://github.com/oterrier/pyprocessors_glotlid
30
+ ```
31
+
32
+ ### Running the test suite
33
+
34
+ Install dependencies and run the full test suite:
35
+
36
+ ```
37
+ uv sync --extra test
38
+ uv run pytest
39
+ ```
40
+
41
+ ### Linting
42
+
43
+ ```
44
+ uv run ruff check .
45
+ uv run ruff format --check .
46
+ ```
47
+
48
+ ### Building the documentation
49
+
50
+ ```
51
+ uv run --extra docs sphinx-build docs docs/_build
52
+ ```
53
+
54
+ The built documentation is available at `docs/_build/index.html`.
55
+
56
+ ## SBOM & vulnerability check
57
+
58
+ Install the SBOM dependencies:
59
+
60
+ ```
61
+ uv sync --extra sbom
62
+ ```
63
+
64
+ Generate a CycloneDX SBOM from the current environment:
65
+
66
+ ```
67
+ uv run cyclonedx-py environment -o sbom.cdx.json --output-format json
68
+ ```
69
+
70
+ Audit dependencies for known vulnerabilities:
71
+
72
+ ```
73
+ uv run pip-audit --format json --output audit-report.json
74
+ ```
75
+
76
+ To fail on any known vulnerability (useful in CI):
77
+
78
+ ```
79
+ uv run pip-audit --strict
80
+ ```
@@ -0,0 +1,39 @@
1
+ # Release Instructions
2
+
3
+ This document guides a contributor through creating a release of pyprocessors_glotlid.
4
+
5
+ ## Preflight checks
6
+
7
+ ### Ensure all tests pass
8
+
9
+ Locally you can run `uv run pytest` to check that all tests pass, and check that tests
10
+ against all supported environments are passing also by checking the project's
11
+ [GitHub actions](https://github.com/oterrier/pyprocessors_glotlid/actions?query=branch%3Amaster+workflow%3Atests).
12
+
13
+ #### Verify that `AUTHORS.md` is up-to-date
14
+
15
+ The following command shows the number of commits per author since the last
16
+ annotated tag:
17
+ ```
18
+ t=$(git describe --abbrev=0); echo Commits since $t; git shortlog -s $t..
19
+ ```
20
+
21
+ ## Make the release
22
+
23
+ Run
24
+
25
+ ```
26
+ bumpversion release # bump version from .devX to release version
27
+ git push --tags # push tagged release to upstream
28
+ flit publish # publish to PyPI
29
+ ```
30
+
31
+ ## Start work on the next release
32
+
33
+ Run
34
+
35
+ ```
36
+ bumpversion minor
37
+ ```
38
+
39
+ to start work on the next release.
@@ -0,0 +1,100 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pyprocessors-glotlid"
7
+ dynamic = ["version"]
8
+ description = "Sherpa Consolidation processor"
9
+ readme = "README.md"
10
+ license = {file = "LICENSE"}
11
+ authors = [
12
+ {name = "Olivier Terrier", email = "olivier.terrier@kairntech.com"},
13
+ ]
14
+ keywords = []
15
+ classifiers = [
16
+ "Intended Audience :: Information Technology",
17
+ "Intended Audience :: Developers",
18
+ "Intended Audience :: System Administrators",
19
+ "Operating System :: OS Independent",
20
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
21
+ "Topic :: Software Development :: Libraries :: Python Modules",
22
+ "Topic :: Software Development :: Libraries",
23
+ "Topic :: Software Development",
24
+ "License :: OSI Approved :: MIT License",
25
+ "Development Status :: 4 - Beta",
26
+ "Programming Language :: Python :: 3.12",
27
+ ]
28
+ requires-python = ">=3.12"
29
+ dependencies = [
30
+ "pymultirole-plugins>=1.6.0,<1.7.0",
31
+ "fasttext-numpy2-wheel",
32
+ "huggingface_hub",
33
+ "log-with-context",
34
+ "iso639-lang",
35
+ ]
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/oterrier/pyprocessors_glotlid/"
39
+
40
+ [project.entry-points."pyprocessors.plugins"]
41
+ glotlid = "pyprocessors_glotlid.glotlid:GlotLIDProcessor"
42
+
43
+ [project.optional-dependencies]
44
+ test = [
45
+ "pytest",
46
+ "pytest-cov",
47
+ "ruff",
48
+ "pip",
49
+ "dirty-equals",
50
+ ]
51
+ docs = [
52
+ "sphinx",
53
+ "sphinx-rtd-theme",
54
+ "m2r2",
55
+ "sphinxcontrib.apidoc",
56
+ "lxml_html_clean",
57
+ ]
58
+ dev = [
59
+ "pre-commit",
60
+ "bump2version",
61
+ ]
62
+
63
+ sbom = ["cyclonedx-bom", "pip-audit"]
64
+
65
+ [tool.hatch.version]
66
+ path = "src/pyprocessors_glotlid/__init__.py"
67
+
68
+ [tool.hatch.build.targets.wheel]
69
+ packages = ["src/pyprocessors_glotlid"]
70
+
71
+ [tool.hatch.build]
72
+ exclude = [
73
+ "/tests",
74
+ "/docs",
75
+ "Jenkinsfile",
76
+ "Dockerfile",
77
+ "bumpversion.py",
78
+ "mypy.ini",
79
+ "hgnc_cache.sqlite",
80
+ "trivy-html-template.tpl",
81
+ "MIGRATION.md",
82
+ ".gitignore",
83
+ ".dockerignore"
84
+ ]
85
+
86
+ [tool.pytest.ini_options]
87
+ addopts = "--durations=5"
88
+ norecursedirs = ["docs"]
89
+
90
+ [tool.ruff]
91
+ line-length = 120
92
+ target-version = "py312"
93
+
94
+ [tool.ruff.lint]
95
+ select = ["E", "W", "F", "I", "B", "C4", "UP", "ARG", "SIM"]
96
+ ignore = ["E501"]
97
+
98
+ [tool.ruff.format]
99
+ quote-style = "double"
100
+ indent-style = "space"
@@ -0,0 +1,3 @@
1
+ """Sherpa Consolidation processor"""
2
+
3
+ __version__ = "1.6.3"
@@ -0,0 +1,61 @@
1
+ from functools import cache
2
+ from typing import cast
3
+
4
+ import fasttext
5
+ from huggingface_hub import hf_hub_download
6
+ from iso639 import Lang
7
+ from iso639.exceptions import InvalidLanguageValue
8
+ from log_with_context import Logger, add_logging_context
9
+ from pydantic import BaseModel
10
+ from pymultirole_plugins.v1.processor import ProcessorBase, ProcessorParameters
11
+ from pymultirole_plugins.v1.schema import Document
12
+
13
+ logger = Logger("pymultirole")
14
+
15
+
16
# Parameter model for GlotLIDProcessor. It declares no fields of its own, so it
# exposes exactly what ProcessorParameters already provides. It is also the type
# returned by GlotLIDProcessor.get_model().
# NOTE(review): documented with comments rather than a docstring on purpose -
# presumably this is a pydantic model whose docstring could surface in the
# generated schema; confirm before converting.
class GlotLIDParameters(ProcessorParameters):
    ...
18
+
19
+
20
class GlotLIDProcessor(ProcessorBase):
    """GlotLID language identification processor."""

    def process(self, documents: list[Document], parameters: ProcessorParameters) -> list[Document]:  # noqa: C901
        """Detect the language of each document and record it in its metadata.

        Documents are modified in place: ``metadata`` is initialised to an empty
        dict when it is ``None``, and ``metadata["language"]`` is set from the
        first label predicted by the model (converted by ``_to_iso639_1``).
        Documents whose text is missing, empty or whitespace-only get no
        ``"language"`` entry.

        Args:
            documents: The documents to annotate.
            parameters: Processor parameters; cast to ``GlotLIDParameters`` but
                not otherwise used here.

        Returns:
            The same ``documents`` list that was passed in.
        """
        params: GlotLIDParameters = cast(GlotLIDParameters, parameters)  # noqa: F841
        # Module-level cached loader (not the classmethod of the same name below).
        lid_model = get_model()
        for doc in documents:
            with add_logging_context(docid=doc.identifier):
                if doc.metadata is None:
                    doc.metadata = {}
                raw_text = doc.text or ""
                # Flatten line breaks into spaces so the model receives a single line
                # (presumably required by the fastText predict API - TODO confirm).
                single_line = raw_text.replace("\n", " ").replace("\r", " ").strip()
                if single_line:
                    predicted_labels, _ = lid_model.predict(single_line)
                    doc.metadata["language"] = _to_iso639_1(predicted_labels[0])
        return documents

    @classmethod
    def get_model(cls) -> type[BaseModel]:
        """Return the parameter model class of this processor."""
        return GlotLIDParameters
40
+
41
+
42
def _to_iso639_1(label: str) -> str:
    """Convert a GlotLID label into a language code, preferring ISO 639-1.

    Args:
        label: A model label such as ``"__label__fra_Latn"``.

    Returns:
        The ``pt1`` value that ``Lang`` reports for the label's language part
        when it is non-empty. Otherwise the language part itself is returned
        unchanged, both when ``pt1`` is empty and when ``Lang`` raises
        ``InvalidLanguageValue``.
    """
    # GlotLID labels look like "__label__fra_Latn"; keep only the language part.
    language_part, _, _ = label.removeprefix("__label__").partition("_")
    try:
        two_letter = Lang(language_part).pt1
    except InvalidLanguageValue:
        return language_part
    return two_letter if two_letter else language_part
50
+
51
+
52
@cache
def get_model(ttl_hash=None):
    """Download (if needed) and load the GlotLID fastText model.

    The function is wrapped in ``functools.cache``, so the model is loaded once
    per distinct ``ttl_hash`` value and reused on later calls.

    Args:
        ttl_hash: Not used in the body; it only takes part in the cache key, so
            passing a new value yields a fresh load.

    Returns:
        The model object returned by ``fasttext.load_model``.
    """
    # The argument matters only to the cache decorator; discard it right away.
    del ttl_hash
    # Fetch "model.bin" from the "cis-lmu/glotlid" repository and load it from the
    # returned local path. cache_dir=None leaves the choice of download folder to
    # huggingface_hub (presumably its default cache location - TODO confirm).
    return fasttext.load_model(
        hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
    )