jpreprocess 0.1.6__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jpreprocess/__init__.py +33 -0
- jpreprocess/dictionary.py +51 -0
- jpreprocess/jpreprocess.cp313-win_amd64.pyd +0 -0
- jpreprocess/jpreprocess.pyi +94 -0
- jpreprocess/py.typed +0 -0
- jpreprocess-0.1.6.dist-info/METADATA +17 -0
- jpreprocess-0.1.6.dist-info/RECORD +9 -0
- jpreprocess-0.1.6.dist-info/WHEEL +4 -0
- jpreprocess-0.1.6.dist-info/sboms/jpreprocess-python.cyclonedx.json +6388 -0
jpreprocess/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from .jpreprocess import (
|
|
3
|
+
__version__,
|
|
4
|
+
JPreprocess,
|
|
5
|
+
build_dictionary,
|
|
6
|
+
JPREPROCESS_VERSION,
|
|
7
|
+
)
|
|
8
|
+
from .dictionary import download_dictionary, dictionary_path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Names exported by a star-import of this package. The jpreprocess() factory
# defined in this module, and the imported dictionary_path and __version__,
# are not listed here.
__all__ = [
|
|
12
|
+
"JPreprocess",
|
|
13
|
+
"build_dictionary",
|
|
14
|
+
"download_dictionary",
|
|
15
|
+
"JPREPROCESS_VERSION",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def jpreprocess(
    dictionary_version: str = f"v{JPREPROCESS_VERSION}",
    # Quoted so the annotation is not evaluated at definition time: the
    # `X | None` operator on types raises TypeError before Python 3.10, and
    # this package declares support for 3.8+.
    user_dictionary: "str | None" = None,
) -> JPreprocess:
    """
    Create a JPreprocess instance with the naist-jdic dictionary.

    If the system dictionary is not present at the path returned by
    dictionary_path(), this function downloads it first.

    Arguments:
    - dictionary_version (str): Version of dictionary to download.
        We don't recommend specifying this argument unless you are aware of what you are doing.
    - user_dictionary (str | None): Path to user dictionary. The extension must be ".csv" or ".bin".

    Returns:
    - JPreprocess: Instance constructed from the dictionary path and the
        optional user dictionary.
    """
    dict_path = dictionary_path(dictionary_version)
    # Only hit the network when the dictionary directory is missing.
    if not os.path.exists(dict_path):
        download_dictionary(dictionary_version)
    return JPreprocess(dict_path, user_dictionary)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from contextlib import ExitStack
|
|
2
|
+
import sys
|
|
3
|
+
import atexit
|
|
4
|
+
|
|
5
|
+
# from https://github.com/r9y9/pyopenjtalk/pull/74
|
|
6
|
+
|
|
7
|
+
if sys.version_info >= (3, 9):
|
|
8
|
+
from importlib.resources import as_file, files
|
|
9
|
+
else:
|
|
10
|
+
from importlib_resources import as_file, files
|
|
11
|
+
|
|
12
|
+
# Holds every context opened through as_file() in dictionary_path(); the
# stack is closed at interpreter exit by the atexit hook registered below.
_file_manager = ExitStack()
|
|
13
|
+
atexit.register(_file_manager.close)
|
|
14
|
+
# Resource root of this package; dictionaries are looked up and extracted
# beneath it, one sub-directory per version.
_file_ref = files(__package__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def dictionary_path(version: str) -> str:
    """
    Return the path of the naist-jdic dictionary for the given version.

    The location is `<package>/<version>/naist-jdic`. The path is not
    checked for existence here. The as_file() context is entered on the
    module-level exit stack, so it stays open until interpreter exit.

    Arguments:
    - version (str): Dictionary version, used as the sub-directory name.

    Returns:
    - str: Path to the dictionary directory.
    """
    resource = _file_ref / version / "naist-jdic"
    concrete_path = _file_manager.enter_context(as_file(resource))
    return str(concrete_path)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def download_dictionary(version: str) -> str:
    """
    Download the naist-jdic dictionary archive and extract it into the package.

    The archive `naist-jdic-jpreprocess.tar.gz` is fetched from the
    jpreprocess GitHub releases and unpacked under `<package>/<version>`.
    If `tqdm` is importable, a progress bar is shown while downloading.
    Errors raised by the download or the extraction are not caught here.

    Arguments:
    - version (str): Release tag to download, or "latest" for the latest release.

    Returns:
    - str: The directory the archive was extracted into.
    """
    from urllib.request import urlopen
    import tarfile
    import tempfile

    if version == "latest":
        url = "https://github.com/jpreprocess/jpreprocess/releases/latest/download/naist-jdic-jpreprocess.tar.gz"
    else:
        url = f"https://github.com/jpreprocess/jpreprocess/releases/download/{version}/naist-jdic-jpreprocess.tar.gz"

    target_dir = _file_ref / version

    # Spool the download into an anonymous temporary file so the archive
    # never needs a named location on disk.
    with tempfile.TemporaryFile() as file:
        print(f'Downloading: "{url}"')
        with urlopen(url) as response:
            try:
                from tqdm.auto import tqdm
                # Wrap file.write so every chunk written advances the bar.
                with tqdm.wrapattr(file, "write", total=getattr(response, "length", None)) as progress_file:
                    for chunk in response:
                        progress_file.write(chunk)
            except ImportError:
                # tqdm is an optional extra; fall back to a plain copy.
                for chunk in response:
                    file.write(chunk)

        # Rewind so tarfile reads the archive from its first byte.
        file.seek(0)
        print("Extracting tar file")
        with tarfile.open(mode="r|gz", fileobj=file) as archive, as_file(target_dir) as extract_dir:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse members that
                # would escape the destination (absolute paths, "..",
                # unsafe links, device files).
                archive.extractall(path=extract_dir, filter="data")
            else:
                # NOTE: interpreters without extraction filters extract
                # members unchecked.
                archive.extractall(path=extract_dir)
        print("done")

    return str(target_dir)
|
|
Binary file
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from typing import TypedDict, Literal, overload
|
|
2
|
+
|
|
3
|
+
__version__: str
|
|
4
|
+
JPREPROCESS_VERSION: str
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NjdObject(TypedDict):
    """
    Dictionary form of one NJD node.

    Lists of these are returned by JPreprocess.run_frontend() and accepted
    by JPreprocess.make_label().

    NOTE(review): field meanings presumably follow Open JTalk's NJD node
    (surface string, part of speech, conjugation type/form, original form,
    reading, pronunciation, accent, ...) -- confirm against the native
    implementation.
    """

    # Textual attributes.
    string: str
    pos: str
    pos_group1: str
    pos_group2: str
    pos_group3: str
    ctype: str
    cform: str
    orig: str
    read: str
    pron: str

    # Numeric attributes and chaining information.
    acc: int
    mora_size: int
    chain_rule: str
    chain_flag: int
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class JPreprocess:
    def __init__(
        self,
        dictionary: str,
        user_dictionary: str | None = None,
    ) -> None: ...

    def run_frontend(self, text: str) -> list[NjdObject]:
        """
        Run the text-processing frontend on the given text.

        Arguments:
        - text (str): Input japanese text.

        Returns:
        - list[NjdObject]: list of NJDNode(s).
        """

    def make_label(self, njd_features: list[NjdObject]) -> list[str]:
        """
        Build full-context labels from NjdObject entries.

        Arguments:
        - njd_features (list[NjdObject]): list of NJDNode(s).

        Returns:
        - list[str]: list of full-context labels.
        """

    def extract_fullcontext(self, text: str) -> list[str]:
        """
        Extract full-context labels directly from the input text.

        Arguments:
        - text (str): Input japanese text.

        Returns:
        - list[str]: list of full-context labels.
        """

    # join omitted or True: a single joined string is returned.
    @overload
    def g2p(
        self,
        text: str,
        kana: bool = False,
        join: Literal[True] = True,
    ) -> str: ...

    # join=False passed by keyword: a list is returned.
    @overload
    def g2p(
        self,
        text: str,
        kana: bool = False,
        *,
        join: Literal[False],
    ) -> list[str]: ...

    # join=False passed positionally (kana must then be given as well).
    @overload
    def g2p(
        self,
        text: str,
        kana: bool,
        join: Literal[False],
    ) -> list[str]:
        """
        Grapheme-to-phoneme (G2P) conversion.

        Arguments:
        - text (str): Input japanese text.
        - kana (bool): False to generate alphabetical phonemes, True to generate kana.
        - join (bool): False to return a list of phonemes or kana, True to join the output by a delimiter.

        Returns:
        - list[str] (when join = False): list of phonemes (kana = False) or kana (kana = True).
        - str (when join = True): phonemes joined with a space (kana = False) or kana joined with an empty string (kana = True).
        """
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def build_dictionary(
    input: str,
    output: str,
    user: bool = False,
    serializer: Literal["jpreprocess", "lindera"] = "jpreprocess",
) -> None:
    """
    Build dictionary binary file(s) from dictionary sources.

    Arguments:
    - input (str): Source path: a directory for a system dictionary, a file for a user dictionary.
    - output (str): Destination path: a directory for a system dictionary, a file for a user dictionary.
    - user (bool): False builds a system dictionary, True builds a user dictionary. Defaults to False.
    - serializer (str): Name of the serializer to use ("lindera" or "jpreprocess"). Defaults to "jpreprocess".
    """
|
jpreprocess/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jpreprocess
|
|
3
|
+
Version: 0.1.6
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
7
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
8
|
+
Classifier: Topic :: Scientific/Engineering
|
|
9
|
+
Classifier: Topic :: Software Development
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Requires-Dist: importlib-resources ; python_full_version < '3.9'
|
|
13
|
+
Requires-Dist: pytest>=8.0.1 ; extra == 'dev'
|
|
14
|
+
Requires-Dist: tqdm ; extra == 'progress'
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Provides-Extra: progress
|
|
17
|
+
Requires-Python: >=3.8, <3.15
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
jpreprocess/__init__.py,sha256=9M2JDa43Z4iddAPp9hvL-zNNlM21mm3gPIe_VpRuPik,1039
|
|
2
|
+
jpreprocess/dictionary.py,sha256=NcyypwcL3yX-h6-KmiAqd00E-0D5Hc2snlXJJ8NegRM,1637
|
|
3
|
+
jpreprocess/jpreprocess.cp313-win_amd64.pyd,sha256=1B7QVGTmdywHDvKeG44CXFZ7XGa2F3_f6AHQm5S5NP4,3188224
|
|
4
|
+
jpreprocess/jpreprocess.pyi,sha256=Lx6vCk-x7x3B7cjhLZH6-oBdYnyni6o00i-NLxoMLOE,2867
|
|
5
|
+
jpreprocess/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
jpreprocess-0.1.6.dist-info/METADATA,sha256=yiUCuGx0zxZDlrLSKQUV6iJ0DrrYVV1ULqFDiRcSSLA,696
|
|
7
|
+
jpreprocess-0.1.6.dist-info/WHEEL,sha256=z3sDn4xNPtieBDo9mUKkT1e80gbhCuRsQQi1_g6mdQM,97
|
|
8
|
+
jpreprocess-0.1.6.dist-info/sboms/jpreprocess-python.cyclonedx.json,sha256=br7Ex7ZUtRjHlJWJ2d3XF-FPuA15P0zAl3N1owMqNTQ,203162
|
|
9
|
+
jpreprocess-0.1.6.dist-info/RECORD,,
|