pyconvertu 0.4.1__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyconvertu-1.0.0/PKG-INFO +108 -0
- pyconvertu-1.0.0/README.md +90 -0
- pyconvertu-1.0.0/convertu/__init__.py +8 -0
- pyconvertu-1.0.0/convertu/__main__.py +96 -0
- pyconvertu-1.0.0/convertu/convertu.py +198 -0
- pyconvertu-1.0.0/pyconvertu.egg-info/PKG-INFO +108 -0
- pyconvertu-1.0.0/pyconvertu.egg-info/SOURCES.txt +10 -0
- pyconvertu-1.0.0/pyconvertu.egg-info/entry_points.txt +2 -0
- pyconvertu-1.0.0/pyconvertu.egg-info/top_level.txt +1 -0
- pyconvertu-1.0.0/pyproject.toml +46 -0
- pyconvertu-1.0.0/setup.cfg +4 -0
- pyconvertu-0.4.1/LICENSE.txt +0 -7
- pyconvertu-0.4.1/PKG-INFO +0 -104
- pyconvertu-0.4.1/README.md +0 -86
- pyconvertu-0.4.1/pyconvertu/__init__.py +0 -3
- pyconvertu-0.4.1/pyconvertu/classification.json +0 -1
- pyconvertu-0.4.1/pyconvertu/pyconvertu.py +0 -177
- pyconvertu-0.4.1/pyconvertu.egg-info/PKG-INFO +0 -104
- pyconvertu-0.4.1/pyconvertu.egg-info/SOURCES.txt +0 -11
- pyconvertu-0.4.1/pyconvertu.egg-info/top_level.txt +0 -1
- pyconvertu-0.4.1/setup.cfg +0 -7
- pyconvertu-0.4.1/setup.py +0 -35
- {pyconvertu-0.4.1 → pyconvertu-1.0.0}/pyconvertu.egg-info/dependency_links.txt +0 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyconvertu
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: From/to Classification Converter
|
|
5
|
+
Author-email: The Economist <29724411+econcz@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/econcz/pyconvertu
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/econcz/pyconvertu/issues
|
|
9
|
+
Keywords: data-science,classification,converter,iso-3166-1,iso-3166-1-alpha-3,iso-3166-1-alpha-2,iso-3166-1-numeric
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# CONVERTU - From/to Classification Converter
|
|
20
|
+
|
|
21
|
+
Tools for creating and converting between classification systems.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install pyconvertu
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick example
|
|
30
|
+
python:
|
|
31
|
+
```python
|
|
32
|
+
from pyconvertu import convert
|
|
33
|
+
|
|
34
|
+
print(convert(to="iso3", text=["Czech Republic", "Slovakia"]))
|
|
35
|
+
```
|
|
36
|
+
bash:
|
|
37
|
+
```bash
|
|
38
|
+
uconv -t iso3 'Czech Republic' 'Slovakia'
|
|
39
|
+
echo -e "Czech Republic\nSlovakia" | uconv -t iso3
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## User Reference
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
convert(
|
|
46
|
+
data=[...], json_file='...', info=False, dump=False,
|
|
47
|
+
to="...", text="..." | ["...", "..."], *args, **kwargs
|
|
48
|
+
)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Convert text into a target classification using a JSON mapping, or return mapping/metadata (info/dump modes).
|
|
52
|
+
|
|
53
|
+
**Parameters:**
|
|
54
|
+
|
|
55
|
+
`data` : *list[dict]*, optional
|
|
56
|
+
A complete classification mapping provided directly as a list of dictionaries. If supplied without `json_file`, this data will be used in-memory for conversions without reading from disk.
|
|
57
|
+
|
|
58
|
+
`json_file` : *str*, optional
|
|
59
|
+
Path to the classification JSON file. If not provided, the default bundled `classification.json` is used. When `data` is not supplied, this file is loaded and used as the source mapping. When `data` is supplied along with `json_file`, the data is written to `json_file` and the (user-expanded) path is returned instead of a conversion result.
|
|
60
|
+
|
|
61
|
+
`info` : *bool*, default = *False*
|
|
62
|
+
If *True*, return only metadata/sources entries. No conversion.
|
|
63
|
+
|
|
64
|
+
`dump` : *bool*, default = *False*
|
|
65
|
+
If *True*, return the full mapping (filtered of metadata/sources). No conversion.
|
|
66
|
+
|
|
67
|
+
`to` : *str*
|
|
68
|
+
Target field name to return from matched records (e.g., "iso3").
|
|
69
|
+
|
|
70
|
+
`text` : *str* | *list[str]*
|
|
71
|
+
One string or a list of strings to convert. A single string input yields a single string output; a list yields a list.
|
|
72
|
+
|
|
73
|
+
**Classification passed via `data`**
|
|
74
|
+
|
|
75
|
+
The JSON must follow the same structure as the bundled classification.json.
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
[
|
|
79
|
+
{
|
|
80
|
+
"regex": "^(.*afgh.*|\\s*AFG\\s*|\\s*AF\\s*|\\s*4\\s*)$",
|
|
81
|
+
"name_en": "Afghanistan",
|
|
82
|
+
"name_fr": "Afghanistan (l')",
|
|
83
|
+
"iso3": "AFG",
|
|
84
|
+
"iso2": "AF",
|
|
85
|
+
"isoN": "4"
|
|
86
|
+
},
|
|
87
|
+
...
|
|
88
|
+
{
|
|
89
|
+
"metadata": {
|
|
90
|
+
"name_en": "English short name",
|
|
91
|
+
"name_fr": "French short name",
|
|
92
|
+
"iso3": "alpha-3 code",
|
|
93
|
+
"iso2": "alpha-2 code",
|
|
94
|
+
"isoN": "numeric"
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
"sources": [
|
|
99
|
+
"[https://www.iso.org/iso-3166-country-codes.html](ISO 3166 COUNTRY CODES)",
|
|
100
|
+
"[https://en.wikipedia.org/wiki/List_of_alternative_country_names](ALTERNATIVE NAMES)"
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT License — see the [LICENSE](LICENSE) file.
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# CONVERTU - From/to Classification Converter
|
|
2
|
+
|
|
3
|
+
Tools for creating and converting between classification systems.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install pyconvertu
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick example
|
|
12
|
+
python:
|
|
13
|
+
```python
|
|
14
|
+
from pyconvertu import convert
|
|
15
|
+
|
|
16
|
+
print(convert(to="iso3", text=["Czech Republic", "Slovakia"]))
|
|
17
|
+
```
|
|
18
|
+
bash:
|
|
19
|
+
```bash
|
|
20
|
+
uconv -t iso3 'Czech Republic' 'Slovakia'
|
|
21
|
+
echo -e "Czech Republic\nSlovakia" | uconv -t iso3
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## User Reference
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
convert(
|
|
28
|
+
data=[...], json_file='...', info=False, dump=False,
|
|
29
|
+
to="...", text="..." | ["...", "..."], *args, **kwargs
|
|
30
|
+
)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Convert text into a target classification using a JSON mapping, or return mapping/metadata (info/dump modes).
|
|
34
|
+
|
|
35
|
+
**Parameters:**
|
|
36
|
+
|
|
37
|
+
`data` : *list[dict]*, optional
|
|
38
|
+
A complete classification mapping provided directly as a list of dictionaries. If supplied without `json_file`, this data will be used in-memory for conversions without reading from disk.
|
|
39
|
+
|
|
40
|
+
`json_file` : *str*, optional
|
|
41
|
+
Path to the classification JSON file. If not provided, the default bundled `classification.json` is used. When `data` is not supplied, this file is loaded and used as the source mapping. When `data` is supplied along with `json_file`, the data is written to `json_file` and the (user-expanded) path is returned instead of a conversion result.
|
|
42
|
+
|
|
43
|
+
`info` : *bool*, default = *False*
|
|
44
|
+
If *True*, return only metadata/sources entries. No conversion.
|
|
45
|
+
|
|
46
|
+
`dump` : *bool*, default = *False*
|
|
47
|
+
If *True*, return the full mapping (filtered of metadata/sources). No conversion.
|
|
48
|
+
|
|
49
|
+
`to` : *str*
|
|
50
|
+
Target field name to return from matched records (e.g., "iso3").
|
|
51
|
+
|
|
52
|
+
`text` : *str* | *list[str]*
|
|
53
|
+
One string or a list of strings to convert. A single string input yields a single string output; a list yields a list.
|
|
54
|
+
|
|
55
|
+
**Classification passed via `data`**
|
|
56
|
+
|
|
57
|
+
The JSON must follow the same structure as the bundled classification.json.
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
[
|
|
61
|
+
{
|
|
62
|
+
"regex": "^(.*afgh.*|\\s*AFG\\s*|\\s*AF\\s*|\\s*4\\s*)$",
|
|
63
|
+
"name_en": "Afghanistan",
|
|
64
|
+
"name_fr": "Afghanistan (l')",
|
|
65
|
+
"iso3": "AFG",
|
|
66
|
+
"iso2": "AF",
|
|
67
|
+
"isoN": "4"
|
|
68
|
+
},
|
|
69
|
+
...
|
|
70
|
+
{
|
|
71
|
+
"metadata": {
|
|
72
|
+
"name_en": "English short name",
|
|
73
|
+
"name_fr": "French short name",
|
|
74
|
+
"iso3": "alpha-3 code",
|
|
75
|
+
"iso2": "alpha-2 code",
|
|
76
|
+
"isoN": "numeric"
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
"sources": [
|
|
81
|
+
"[https://www.iso.org/iso-3166-country-codes.html](ISO 3166 COUNTRY CODES)",
|
|
82
|
+
"[https://en.wikipedia.org/wiki/List_of_alternative_country_names](ALTERNATIVE NAMES)"
|
|
83
|
+
]
|
|
84
|
+
}
|
|
85
|
+
]
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## License
|
|
89
|
+
|
|
90
|
+
MIT License — see the [LICENSE](LICENSE) file.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import signal
|
|
3
|
+
import sys
|
|
4
|
+
import argparse
|
|
5
|
+
from pyconvertu import convert, __version__
|
|
6
|
+
from typing import Any
|
|
7
|
+
from json import dumps
|
|
8
|
+
|
|
9
|
+
def _print_human_readable(obj: Any) -> None:
|
|
10
|
+
"""
|
|
11
|
+
Pretty-print JSON-like Python objects with UTF-8 characters preserved.
|
|
12
|
+
"""
|
|
13
|
+
print(dumps(obj, ensure_ascii=False, indent=2))
|
|
14
|
+
|
|
15
|
+
def main() -> int:
    """
    Run the command-line interface and return a process exit status.

    Modes
    -----
    --info : print the metadata/sources entries as indented JSON.
    --dump : print the classification records as indented JSON.
    otherwise : convert the positional texts (or, when none are given and
        STDIN is not a terminal, one non-empty line of STDIN per item) to
        the field named by --to and print one result per line.

    Returns
    -------
    int
        0 on success (also when the output pipe is closed early),
        1 when --to or the input texts are missing,
        130 when interrupted from the keyboard.
    """
    # make SIGPIPE behave like a normal EOF on Unix
    if hasattr(signal, "SIGPIPE"):
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    # prefer UTF-8 on stdout for consistent output; this is best-effort
    # because stdout may have been replaced by an object that cannot be
    # reconfigured
    try:
        sys.stdout.reconfigure(encoding="utf-8")
    except Exception:
        pass

    parser = argparse.ArgumentParser(
        description="Convert from/to the desired classification."
    )
    parser.add_argument(
        "-v", "--version", action="version",
        version=f"%(prog)s {__version__}",
        help="Show version and exit."
    )
    parser.add_argument(
        "-s", "--source", dest="source", default=None, metavar="PATH",
        help="Path to the classification JSON file (default: bundled file)."
    )
    parser.add_argument(
        "-t", "--to", dest="to", default=None, metavar="FIELD",
        help="E.g., iso3, iso2, name_en (Required unless --info/--dump)."
    )
    parser.add_argument(
        "--info", action="store_true",
        help="Show metadata and sources in human-readable form."
    )
    parser.add_argument(
        "--dump", action="store_true",
        help="Show the classification records in human-readable form."
    )
    parser.add_argument(
        "text", nargs="*",
        help="Input text(s) to convert (e.g., uconv -t iso3 'Czech Republic')."
    )
    try:
        args = parser.parse_args()
        if args.info:
            result = convert(json_file=args.source, info=True)
            _print_human_readable(result)
            return 0
        if args.dump:
            result = convert(json_file=args.source, dump=True)
            _print_human_readable(result)
            return 0
        if not args.to:
            print("error: --to/-t is required unless using --info or --dump",
                  file=sys.stderr)
            return 1

        # if no positional args, try reading from stdin (one item per line)
        inputs = list(args.text)
        if not inputs and not sys.stdin.isatty():
            inputs = [line.strip() for line in sys.stdin if line.strip()]
        if not inputs:
            print("error: provide at least one input text (args or STDIN)",
                  file=sys.stderr)
            return 1

        # preserve shape: single token -> str, multiple -> list[str]
        text_in: str | list[str] = inputs[0] if len(inputs) == 1 else inputs
        result = convert(json_file=args.source, to=args.to, text=text_in)
        if isinstance(result, list):
            for item in result:
                print(item)
        else:
            print(result)
        return 0
    except BrokenPipeError:
        # The reader went away. Closing stdout may fail again while flushing
        # into the broken pipe; only that I/O failure is ignored. A `return`
        # inside `finally` would also discard KeyboardInterrupt/SystemExit.
        try:
            sys.stdout.close()
        except OSError:
            pass
        return 0
    except KeyboardInterrupt:
        return 130
|
|
94
|
+
|
|
95
|
+
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from os import path
|
|
3
|
+
from re import compile, Pattern, I, M, error as RegexError
|
|
4
|
+
from typing import Any
|
|
5
|
+
from json import load, dump as save, JSONDecodeError
|
|
6
|
+
|
|
7
|
+
class ConvertUError(Exception):
|
|
8
|
+
"""
|
|
9
|
+
Exception class for ConvertU-related errors.
|
|
10
|
+
|
|
11
|
+
Raised when an error occurs during classification conversion,
|
|
12
|
+
metadata retrieval, or parsing of external JSON files.
|
|
13
|
+
Provides structured messaging and optional diagnostic tagging.
|
|
14
|
+
|
|
15
|
+
Parameters
|
|
16
|
+
----------
|
|
17
|
+
message : str, optional
|
|
18
|
+
Human-readable description of the error. Defaults to a generic message.
|
|
19
|
+
|
|
20
|
+
code : int or str, optional
|
|
21
|
+
Optional error code or identifier for structured handling.
|
|
22
|
+
|
|
23
|
+
Usage
|
|
24
|
+
-----
|
|
25
|
+
raise ConvertUError("Unsupported classification key", code=400)
|
|
26
|
+
"""
|
|
27
|
+
def __init__(self, message: str = "An error occurred in convertu",
|
|
28
|
+
code: int | str | None = None):
|
|
29
|
+
self.message = message
|
|
30
|
+
self.code = code
|
|
31
|
+
full_message = f"{message} (Code: {code})" if code is not None \
|
|
32
|
+
else message
|
|
33
|
+
super().__init__(full_message)
|
|
34
|
+
|
|
35
|
+
def __str__(self) -> str:
|
|
36
|
+
return self.message if self.code is None \
|
|
37
|
+
else f"{self.message} [Code: {self.code}]"
|
|
38
|
+
|
|
39
|
+
def as_dict(self) -> dict:
|
|
40
|
+
return {"error": self.message, "code": self.code}
|
|
41
|
+
|
|
42
|
+
def _default_json_path() -> str:
|
|
43
|
+
"""
|
|
44
|
+
Resolve the default path to the bundled `classification.json` file.
|
|
45
|
+
"""
|
|
46
|
+
try:
|
|
47
|
+
base = sys.modules['pyconvertu'].__file__
|
|
48
|
+
except KeyError:
|
|
49
|
+
base = __file__
|
|
50
|
+
return path.join(path.dirname(base), r'classification.json')
|
|
51
|
+
|
|
52
|
+
def _validate_data(obj: list[dict[str, Any]]) -> None:
|
|
53
|
+
"""
|
|
54
|
+
Validate that `obj` conforms to the required structure:
|
|
55
|
+
- at least one dict has key 'regex' (value is a str );
|
|
56
|
+
- exactly one dict has key 'metadata' (value is a dict);
|
|
57
|
+
- exactly one dict has key 'sources' (value is a list).
|
|
58
|
+
"""
|
|
59
|
+
if not isinstance(obj, list) or not all(isinstance(d, dict) for d in obj):
|
|
60
|
+
raise ConvertUError("obj must be a list[dict]")
|
|
61
|
+
if not any(isinstance(d.get("regex"), str) and d.get("regex", "").strip()
|
|
62
|
+
for d in obj):
|
|
63
|
+
raise ConvertUError("obj must include at least one entry with "
|
|
64
|
+
"a non-empty 'regex' string")
|
|
65
|
+
if sum(1 for d in obj if "metadata" in d and
|
|
66
|
+
isinstance(d["metadata"], dict) and
|
|
67
|
+
d["metadata"]) != 1:
|
|
68
|
+
raise ConvertUError("obj must include exactly one non-empty "
|
|
69
|
+
"'metadata' dict")
|
|
70
|
+
if sum(1 for d in obj if "sources" in d and
|
|
71
|
+
isinstance(d["sources"], list) and
|
|
72
|
+
d["sources"]) != 1:
|
|
73
|
+
raise ConvertUError("obj must include exactly one non-empty "
|
|
74
|
+
"'sources' list")
|
|
75
|
+
|
|
76
|
+
def convert(
|
|
77
|
+
data: list[dict] | None = None, json_file: str | None = None,
|
|
78
|
+
info: bool = False, dump: bool = False,
|
|
79
|
+
to: str = '', text: str | list[str] | None = None,
|
|
80
|
+
*args, **kwargs
|
|
81
|
+
) -> str | list[str] | list[dict]:
|
|
82
|
+
"""
|
|
83
|
+
Convert text into a target classification using a JSON mapping, or
|
|
84
|
+
return mapping/metadata (info/dump modes).
|
|
85
|
+
|
|
86
|
+
Parameters
|
|
87
|
+
----------
|
|
88
|
+
data : list[dict], optional
|
|
89
|
+
A complete classification mapping provided directly as a list of
|
|
90
|
+
dictionaries. If supplied without `json_file`, this data will be
|
|
91
|
+
used in-memory for conversions without reading from disk.
|
|
92
|
+
|
|
93
|
+
json_file : str, optional
|
|
94
|
+
Path to the classification JSON file. If not provided, the default
|
|
95
|
+
bundled `classification.json` is used. When `data` is not supplied,
|
|
96
|
+
this file is loaded and used as the source mapping. When `data` is
|
|
97
|
+
supplied along with `json_file`, the data is written to `json_file`.
|
|
98
|
+
|
|
99
|
+
info : bool, default = False
|
|
100
|
+
If True, return only metadata/sources entries. No conversion.
|
|
101
|
+
|
|
102
|
+
dump : bool, default = False
|
|
103
|
+
If True, return the full mapping (filtered of metadata/sources).
|
|
104
|
+
No conversion.
|
|
105
|
+
|
|
106
|
+
to : str
|
|
107
|
+
Target field name to return from matched records (e.g., "iso3").
|
|
108
|
+
|
|
109
|
+
text : str | list[str]
|
|
110
|
+
One string or a list of strings to convert. A single string input
|
|
111
|
+
yields a single string output; a list yields a list.
|
|
112
|
+
|
|
113
|
+
Returns
|
|
114
|
+
-------
|
|
115
|
+
str | list[str] | list[dict]
|
|
116
|
+
- If `info=True`: list of metadata/sources dicts.
|
|
117
|
+
- If `dump=True`: list of mapping dicts (no metadata/sources).
|
|
118
|
+
- Otherwise: converted string(s). If no match is found, the original
|
|
119
|
+
value is returned. If `text` is None, returns an empty list.
|
|
120
|
+
"""
|
|
121
|
+
# retrieve the metadata/sources and classification
|
|
122
|
+
if data is not None: # read from the argument
|
|
123
|
+
_validate_data(data)
|
|
124
|
+
if json_file is not None:
|
|
125
|
+
json_file = path.expanduser(json_file)
|
|
126
|
+
try:
|
|
127
|
+
with open(json_file, "w", encoding="utf-8") as f:
|
|
128
|
+
save(data, f, ensure_ascii=False, indent=2)
|
|
129
|
+
except OSError as e:
|
|
130
|
+
raise ConvertUError(f"Unable to write to {json_file}: {e}")
|
|
131
|
+
return json_file
|
|
132
|
+
else: # read from the file
|
|
133
|
+
if json_file is None:
|
|
134
|
+
json_file = path.expanduser(_default_json_path())
|
|
135
|
+
if not path.isfile(json_file):
|
|
136
|
+
raise ConvertUError(f"Classification file not found: {json_file}")
|
|
137
|
+
try:
|
|
138
|
+
with open(json_file, encoding="utf-8") as f:
|
|
139
|
+
data = load(f)
|
|
140
|
+
except JSONDecodeError as e:
|
|
141
|
+
raise ConvertUError(f"Invalid JSON in {json_file}: {e}")
|
|
142
|
+
except OSError as e:
|
|
143
|
+
raise ConvertUError(f"Unable to read {json_file}: {e}")
|
|
144
|
+
if not isinstance(data, list):
|
|
145
|
+
raise ConvertUError("Mapping JSON must be a list of objects")
|
|
146
|
+
metadata = [
|
|
147
|
+
d for d in data
|
|
148
|
+
if isinstance(d, dict) and ('metadata' in d or
|
|
149
|
+
'sources' in d)
|
|
150
|
+
]
|
|
151
|
+
classification: list[dict[str, Any]] = [
|
|
152
|
+
d for d in data
|
|
153
|
+
if isinstance(d, dict) and ('metadata' not in d and
|
|
154
|
+
'sources' not in d)
|
|
155
|
+
]
|
|
156
|
+
|
|
157
|
+
# return metadata/sources or classification
|
|
158
|
+
if info:
|
|
159
|
+
return metadata # return metadata
|
|
160
|
+
if dump:
|
|
161
|
+
return classification # return classification
|
|
162
|
+
|
|
163
|
+
# process arguments
|
|
164
|
+
if text is None:
|
|
165
|
+
items: list[str] = []
|
|
166
|
+
single_input = False
|
|
167
|
+
elif isinstance(text, str):
|
|
168
|
+
items = [text]
|
|
169
|
+
single_input = True
|
|
170
|
+
elif isinstance(text, list) and all(isinstance(s, str) for s in text):
|
|
171
|
+
items = text
|
|
172
|
+
single_input = False
|
|
173
|
+
else:
|
|
174
|
+
raise ConvertUError("text must be str, list[str], or None")
|
|
175
|
+
|
|
176
|
+
# precompile regex patterns once
|
|
177
|
+
compiled: list[tuple[Pattern[str], dict[str, Any]]] = []
|
|
178
|
+
for r in classification:
|
|
179
|
+
p = r.get('regex')
|
|
180
|
+
if to in r and isinstance(p, str) and p:
|
|
181
|
+
try:
|
|
182
|
+
compiled.append((compile(p, I | M), r))
|
|
183
|
+
except RegexError:
|
|
184
|
+
continue
|
|
185
|
+
if items and not compiled:
|
|
186
|
+
return text if single_input else items
|
|
187
|
+
|
|
188
|
+
# convert compiled
|
|
189
|
+
def convert_one(s: str) -> str:
|
|
190
|
+
s = str(s)
|
|
191
|
+
for p, r in compiled:
|
|
192
|
+
if p.search(s):
|
|
193
|
+
val = r.get(to)
|
|
194
|
+
return s if val is None else val # return converted text
|
|
195
|
+
return s # return original text
|
|
196
|
+
|
|
197
|
+
result = [convert_one(s) for s in items]
|
|
198
|
+
return result[0] if single_input else result
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyconvertu
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: From/to Classification Converter
|
|
5
|
+
Author-email: The Economist <29724411+econcz@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/econcz/pyconvertu
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/econcz/pyconvertu/issues
|
|
9
|
+
Keywords: data-science,classification,converter,iso-3166-1,iso-3166-1-alpha-3,iso-3166-1-alpha-2,iso-3166-1-numeric
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# CONVERTU - From/to Classification Converter
|
|
20
|
+
|
|
21
|
+
Tools for creating and converting between classification systems.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install pyconvertu
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick example
|
|
30
|
+
python:
|
|
31
|
+
```python
|
|
32
|
+
from pyconvertu import convert
|
|
33
|
+
|
|
34
|
+
print(convert(to="iso3", text=["Czech Republic", "Slovakia"]))
|
|
35
|
+
```
|
|
36
|
+
bash:
|
|
37
|
+
```bash
|
|
38
|
+
uconv -t iso3 'Czech Republic' 'Slovakia'
|
|
39
|
+
echo -e "Czech Republic\nSlovakia" | uconv -t iso3
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## User Reference
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
convert(
|
|
46
|
+
data=[...], json_file='...', info=False, dump=False,
|
|
47
|
+
to="...", text="..." | ["...", "..."], *args, **kwargs
|
|
48
|
+
)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Convert text into a target classification using a JSON mapping, or return mapping/metadata (info/dump modes).
|
|
52
|
+
|
|
53
|
+
**Parameters:**
|
|
54
|
+
|
|
55
|
+
`data` : *list[dict]*, optional
|
|
56
|
+
A complete classification mapping provided directly as a list of dictionaries. If supplied without `json_file`, this data will be used in-memory for conversions without reading from disk.
|
|
57
|
+
|
|
58
|
+
`json_file` : *str*, optional
|
|
59
|
+
Path to the classification JSON file. If not provided, the default bundled `classification.json` is used. When `data` is not supplied, this file is loaded and used as the source mapping. When `data` is supplied along with `json_file`, the data is written to `json_file` and the (user-expanded) path is returned instead of a conversion result.
|
|
60
|
+
|
|
61
|
+
`info` : *bool*, default = *False*
|
|
62
|
+
If *True*, return only metadata/sources entries. No conversion.
|
|
63
|
+
|
|
64
|
+
`dump` : *bool*, default = *False*
|
|
65
|
+
If *True*, return the full mapping (filtered of metadata/sources). No conversion.
|
|
66
|
+
|
|
67
|
+
`to` : *str*
|
|
68
|
+
Target field name to return from matched records (e.g., "iso3").
|
|
69
|
+
|
|
70
|
+
`text` : *str* | *list[str]*
|
|
71
|
+
One string or a list of strings to convert. A single string input yields a single string output; a list yields a list.
|
|
72
|
+
|
|
73
|
+
**Classification passed via `data`**
|
|
74
|
+
|
|
75
|
+
The JSON must follow the same structure as the bundled classification.json.
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
[
|
|
79
|
+
{
|
|
80
|
+
"regex": "^(.*afgh.*|\\s*AFG\\s*|\\s*AF\\s*|\\s*4\\s*)$",
|
|
81
|
+
"name_en": "Afghanistan",
|
|
82
|
+
"name_fr": "Afghanistan (l')",
|
|
83
|
+
"iso3": "AFG",
|
|
84
|
+
"iso2": "AF",
|
|
85
|
+
"isoN": "4"
|
|
86
|
+
},
|
|
87
|
+
...
|
|
88
|
+
{
|
|
89
|
+
"metadata": {
|
|
90
|
+
"name_en": "English short name",
|
|
91
|
+
"name_fr": "French short name",
|
|
92
|
+
"iso3": "alpha-3 code",
|
|
93
|
+
"iso2": "alpha-2 code",
|
|
94
|
+
"isoN": "numeric"
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
"sources": [
|
|
99
|
+
"[https://www.iso.org/iso-3166-country-codes.html](ISO 3166 COUNTRY CODES)",
|
|
100
|
+
"[https://en.wikipedia.org/wiki/List_of_alternative_country_names](ALTERNATIVE NAMES)"
|
|
101
|
+
]
|
|
102
|
+
}
|
|
103
|
+
]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT License — see the [LICENSE](LICENSE) file.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
convertu/__init__.py
|
|
4
|
+
convertu/__main__.py
|
|
5
|
+
convertu/convertu.py
|
|
6
|
+
pyconvertu.egg-info/PKG-INFO
|
|
7
|
+
pyconvertu.egg-info/SOURCES.txt
|
|
8
|
+
pyconvertu.egg-info/dependency_links.txt
|
|
9
|
+
pyconvertu.egg-info/entry_points.txt
|
|
10
|
+
pyconvertu.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
convertu
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pyconvertu"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "From/to Classification Converter"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
license-files = ["LICENSE"]
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "The Economist", email = "29724411+econcz@users.noreply.github.com" }
|
|
11
|
+
]
|
|
12
|
+
keywords = [
|
|
13
|
+
"data-science",
|
|
14
|
+
"classification",
|
|
15
|
+
"converter",
|
|
16
|
+
"iso-3166-1",
|
|
17
|
+
"iso-3166-1-alpha-3",
|
|
18
|
+
"iso-3166-1-alpha-2",
|
|
19
|
+
"iso-3166-1-numeric"
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Operating System :: OS Independent",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Information Analysis"
|
|
28
|
+
]
|
|
29
|
+
dependencies = []
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
uconv = "pyconvertu.__main__:main"
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/econcz/pyconvertu"
|
|
36
|
+
"Bug Tracker" = "https://github.com/econcz/pyconvertu/issues"
|
|
37
|
+
|
|
38
|
+
[build-system]
|
|
39
|
+
requires = ["setuptools>=77", "wheel"]
|
|
40
|
+
build-backend = "setuptools.build_meta"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
packages = ["convertu"]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.package-data]
|
|
46
|
+
pyconvertu = ["classification.json"]
|
pyconvertu-0.4.1/LICENSE.txt
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
Copyright 2021 econcz
|
|
2
|
-
|
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
-
|
|
5
|
-
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
-
|
|
7
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|