tdmelodic-torch 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. tdmelodic_torch-2.0.0/LICENSE +29 -0
  2. tdmelodic_torch-2.0.0/MANIFEST.in +5 -0
  3. tdmelodic_torch-2.0.0/PKG-INFO +88 -0
  4. tdmelodic_torch-2.0.0/README.md +45 -0
  5. tdmelodic_torch-2.0.0/requirements.txt +8 -0
  6. tdmelodic_torch-2.0.0/setup.cfg +4 -0
  7. tdmelodic_torch-2.0.0/setup.py +69 -0
  8. tdmelodic_torch-2.0.0/tdmelodic/__init__.py +11 -0
  9. tdmelodic_torch-2.0.0/tdmelodic/filters/__init__.py +0 -0
  10. tdmelodic_torch-2.0.0/tdmelodic/filters/neologd_patch.py +165 -0
  11. tdmelodic_torch-2.0.0/tdmelodic/filters/neologd_preprocess.py +127 -0
  12. tdmelodic_torch-2.0.0/tdmelodic/filters/neologd_rmdups.py +94 -0
  13. tdmelodic_torch-2.0.0/tdmelodic/filters/postprocess_modify_unigram_cost.py +96 -0
  14. tdmelodic_torch-2.0.0/tdmelodic/filters/yomi/__init__.py +0 -0
  15. tdmelodic_torch-2.0.0/tdmelodic/filters/yomi/basic.py +49 -0
  16. tdmelodic_torch-2.0.0/tdmelodic/filters/yomi/particle_yomi.py +67 -0
  17. tdmelodic_torch-2.0.0/tdmelodic/filters/yomi/wrong_yomi_detection.py +92 -0
  18. tdmelodic_torch-2.0.0/tdmelodic/filters/yomi/yomieval.py +49 -0
  19. tdmelodic_torch-2.0.0/tdmelodic/nn/__init__.py +0 -0
  20. tdmelodic_torch-2.0.0/tdmelodic/nn/convert.py +138 -0
  21. tdmelodic_torch-2.0.0/tdmelodic/nn/convert_dic.py +149 -0
  22. tdmelodic_torch-2.0.0/tdmelodic/nn/inference.py +196 -0
  23. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/__init__.py +0 -0
  24. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/category/__init__.py +0 -0
  25. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/category/list_of_symbols/__init__.py +0 -0
  26. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/category/list_of_symbols/acc_concat.py +13 -0
  27. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/category/list_of_symbols/goshu.py +13 -0
  28. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/category/list_of_symbols/pos_short.py +76 -0
  29. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/category/symbol_map.py +38 -0
  30. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/__init__.py +0 -0
  31. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/accent/__init__.py +0 -0
  32. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/accent/accent_alignment.py +76 -0
  33. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/accent/accent_diff.py +40 -0
  34. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kana/__init__.py +0 -0
  35. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kana/hyphen2romaji.py +45 -0
  36. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kana/kana2roman.py +53 -0
  37. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kana/kanamap/__init__.py +0 -0
  38. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kana/kanamap/kanamap_normal.py +343 -0
  39. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kana/mora_sep.py +70 -0
  40. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/kansuji.py +133 -0
  41. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/japanese/text_normalize.py +45 -0
  42. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/mecab/__init__.py +0 -0
  43. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/mecab/my_mecabrc +63 -0
  44. tdmelodic_torch-2.0.0/tdmelodic/nn/lang/mecab/unidic.py +112 -0
  45. tdmelodic_torch-2.0.0/tdmelodic/nn/loader/__init__.py +0 -0
  46. tdmelodic_torch-2.0.0/tdmelodic/nn/loader/data_loader.py +260 -0
  47. tdmelodic_torch-2.0.0/tdmelodic/nn/loader/data_loader_base.py +51 -0
  48. tdmelodic_torch-2.0.0/tdmelodic/nn/model/__init__.py +0 -0
  49. tdmelodic_torch-2.0.0/tdmelodic/nn/model/decode_accent.py +40 -0
  50. tdmelodic_torch-2.0.0/tdmelodic/nn/model/encode_morae.py +49 -0
  51. tdmelodic_torch-2.0.0/tdmelodic/nn/model/encode_surface.py +59 -0
  52. tdmelodic_torch-2.0.0/tdmelodic/nn/model/modules/__init__.py +0 -0
  53. tdmelodic_torch-2.0.0/tdmelodic/nn/model/modules/cnn_attention.py +60 -0
  54. tdmelodic_torch-2.0.0/tdmelodic/nn/model/modules/dilateconvcausal1d.py +76 -0
  55. tdmelodic_torch-2.0.0/tdmelodic/nn/model/modules/gatedconv1d.py +55 -0
  56. tdmelodic_torch-2.0.0/tdmelodic/nn/model/modules/stacked_conv.py +49 -0
  57. tdmelodic_torch-2.0.0/tdmelodic/nn/net.py +47 -0
  58. tdmelodic_torch-2.0.0/tdmelodic/nn/resource/net_it_2500000 +0 -0
  59. tdmelodic_torch-2.0.0/tdmelodic/util/__init__.py +0 -0
  60. tdmelodic_torch-2.0.0/tdmelodic/util/dic_index_map.py +44 -0
  61. tdmelodic_torch-2.0.0/tdmelodic/util/util.py +18 -0
  62. tdmelodic_torch-2.0.0/tdmelodic/util/word_type.py +179 -0
  63. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/PKG-INFO +88 -0
  64. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/SOURCES.txt +67 -0
  65. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/dependency_links.txt +1 -0
  66. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/entry_points.txt +6 -0
  67. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/not-zip-safe +1 -0
  68. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/requires.txt +8 -0
  69. tdmelodic_torch-2.0.0/tdmelodic_torch.egg-info/top_level.txt +1 -0
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2019-, PKSHA Technology Inc.
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt
4
+ include tdmelodic/nn/lang/mecab/my_mecabrc
5
+ include tdmelodic/nn/resource/net_it_2500000
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: tdmelodic-torch
3
+ Version: 2.0.0
4
+ Summary: tdmelodic: Tokyo Japanese Accent Estimator (PyTorch fork)
5
+ Home-page: https://github.com/Na2CuCl4/tdmelodic
6
+ Author: Hideyuki Tachibana, Zirui Xia
7
+ Author-email: xiazr0422@163.com
8
+ Classifier: Development Status :: 5 - Production/Stable
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: BSD License
13
+ Classifier: Operating System :: POSIX
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Natural Language :: Japanese
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: numpy>=1.15.4
26
+ Requires-Dist: torch>=2.0.0
27
+ Requires-Dist: mecab-python3>=0.996.1
28
+ Requires-Dist: jaconv>=0.2.4
29
+ Requires-Dist: python-Levenshtein>=0.12.0
30
+ Requires-Dist: tqdm>=4.42.1
31
+ Requires-Dist: regex>=2020.1.8
32
+ Requires-Dist: romkan>=0.2.1
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description
37
+ Dynamic: description-content-type
38
+ Dynamic: home-page
39
+ Dynamic: license-file
40
+ Dynamic: requires-dist
41
+ Dynamic: requires-python
42
+ Dynamic: summary
43
+
44
+ <p align="center">
45
+ <img src="https://github.com/PKSHATechnology-Research/tdmelodic/raw/master/docs/imgs/logo/logo_tdmelodic.svg" width="200" />
46
+ </p>
47
+
48
+
49
+ # Tokyo Dialect MELOdic accent DICtionary (tdmelodic) generator
50
+
51
+ [![document](https://readthedocs.org/projects/tdmelodic/badge/?version=latest)](https://tdmelodic.readthedocs.io/en/latest)
52
+ [![arXiv](https://img.shields.io/badge/arXiv-2009.09679-B31B1B.svg)](https://arxiv.org/abs/2009.09679)
53
+ [![Python unittest](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/test.yml/badge.svg)](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/test.yml)
54
+ [![Docker](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/docker-image.yml/badge.svg)](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/docker-image.yml)
55
+ [![Lilypond](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/img.yml/badge.svg)](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/img.yml)
56
+ [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
57
+
58
+
59
+ This module generates a large-scale accent dictionary of
60
+ Japanese (Tokyo dialect) using a neural-network-based technique.
61
+
62
+ > **2026-06**: Migrated the neural network backend from **Chainer** to **PyTorch**.
63
+ > The public API (`Converter.sy2a()`, `Converter.s2ya()`) is fully backward-compatible
64
+ > and produces identical inference results. Now supports Python 3.8+.
65
+
66
+ For academic use, please cite the following paper.
67
+ [[IEEE Xplore]](https://ieeexplore.ieee.org/document/9054081)
68
+ [[arXiv]](https://arxiv.org/abs/2009.09679)
69
+
70
+ ```bibtex
71
+ @inproceedings{tachibana2020icassp,
72
+ author = "H. Tachibana and Y. Katayama",
73
+ title = "Accent Estimation of {Japanese} Words from Their Surfaces and Romanizations
74
+ for Building Large Vocabulary Accent Dictionaries",
75
+ booktitle = {2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
76
+ pages = "8059--8063",
77
+ year = "2020",
78
+ doi = "10.1109/ICASSP40776.2020.9054081"
79
+ }
80
+ ```
81
+
82
+ ## Installation and Usage
83
+
84
+ - English: [tdmelodic Documentation](https://tdmelodic.readthedocs.io/en/latest)
85
+ - 日本語: [tdmelodic 利用マニュアル](https://tdmelodic.readthedocs.io/ja/latest)
86
+
87
+ ## Acknowledgement
88
+ Part of this work is based on results obtained from a project subsidized by the New Energy and Industrial Technology Development Organization (NEDO).
@@ -0,0 +1,45 @@
1
+ <p align="center">
2
+ <img src="https://github.com/PKSHATechnology-Research/tdmelodic/raw/master/docs/imgs/logo/logo_tdmelodic.svg" width="200" />
3
+ </p>
4
+
5
+
6
+ # Tokyo Dialect MELOdic accent DICtionary (tdmelodic) generator
7
+
8
+ [![document](https://readthedocs.org/projects/tdmelodic/badge/?version=latest)](https://tdmelodic.readthedocs.io/en/latest)
9
+ [![arXiv](https://img.shields.io/badge/arXiv-2009.09679-B31B1B.svg)](https://arxiv.org/abs/2009.09679)
10
+ [![Python unittest](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/test.yml/badge.svg)](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/test.yml)
11
+ [![Docker](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/docker-image.yml/badge.svg)](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/docker-image.yml)
12
+ [![Lilypond](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/img.yml/badge.svg)](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/img.yml)
13
+ [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
14
+
15
+
16
+ This module generates a large-scale accent dictionary of
17
+ Japanese (Tokyo dialect) using a neural-network-based technique.
18
+
19
+ > **2026-06**: Migrated the neural network backend from **Chainer** to **PyTorch**.
20
+ > The public API (`Converter.sy2a()`, `Converter.s2ya()`) is fully backward-compatible
21
+ > and produces identical inference results. Now supports Python 3.8+.
22
+
23
+ For academic use, please cite the following paper.
24
+ [[IEEE Xplore]](https://ieeexplore.ieee.org/document/9054081)
25
+ [[arXiv]](https://arxiv.org/abs/2009.09679)
26
+
27
+ ```bibtex
28
+ @inproceedings{tachibana2020icassp,
29
+ author = "H. Tachibana and Y. Katayama",
30
+ title = "Accent Estimation of {Japanese} Words from Their Surfaces and Romanizations
31
+ for Building Large Vocabulary Accent Dictionaries",
32
+ booktitle = {2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
33
+ pages = "8059--8063",
34
+ year = "2020",
35
+ doi = "10.1109/ICASSP40776.2020.9054081"
36
+ }
37
+ ```
38
+
39
+ ## Installation and Usage
40
+
41
+ - English: [tdmelodic Documentation](https://tdmelodic.readthedocs.io/en/latest)
42
+ - 日本語: [tdmelodic 利用マニュアル](https://tdmelodic.readthedocs.io/ja/latest)
43
+
44
+ ## Acknowledgement
45
+ Part of this work is based on results obtained from a project subsidized by the New Energy and Industrial Technology Development Organization (NEDO).
@@ -0,0 +1,8 @@
1
+ numpy>=1.15.4
2
+ torch>=2.0.0
3
+ mecab-python3>=0.996.1
4
+ jaconv>=0.2.4
5
+ python-Levenshtein>=0.12.0
6
+ tqdm>=4.42.1
7
+ regex>=2020.1.8
8
+ romkan>=0.2.1
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env python
2
+
3
+ from setuptools import setup, find_packages
4
+ from os import path
5
+ import re, io
6
+
7
+ def _readme():
8
+ with open('README.md') as readme_file:
9
+ return readme_file.read().replace(":copyright:", "(c)")
10
+
11
+ def _requirements():
12
+ root_dir = path.abspath(path.dirname(__file__))
13
+ return [name.rstrip() for name in open(path.join(root_dir, 'requirements.txt')).readlines()]
14
+
15
+ def _get_version():
16
+ version = re.search(
17
+ r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', # It excludes inline comment too
18
+ io.open('tdmelodic/__init__.py', encoding='utf_8_sig').read()
19
+ ).group(1)
20
+ return version
21
+
22
# Package definition.  The long description, the dependency list and the
# version are all read from files in the source tree by the helpers above.
#
# NOTE: the former ``tests_requires=_requirements()`` argument was removed.
# It was a misspelling of ``tests_require``; setuptools does not know the
# option and ignored it with an "Unknown distribution option" warning, so
# dropping it does not change the built distribution.
setup(
    name="tdmelodic-torch",
    author="Hideyuki Tachibana, Zirui Xia",
    author_email='xiazr0422@163.com',
    python_requires='>=3.8',
    url="https://github.com/Na2CuCl4/tdmelodic",

    description="tdmelodic: Tokyo Japanese Accent Estimator (PyTorch fork)",
    long_description=_readme(),
    long_description_content_type="text/markdown",

    install_requires=_requirements(),
    setup_requires=[],

    include_package_data=True,
    packages=find_packages(include=['tdmelodic', 'tdmelodic.*']),

    version=_get_version(),
    zip_safe=False,

    # Command-line tools installed together with the package.
    entry_points={
        'console_scripts': [
            'tdmelodic-convert = tdmelodic.nn.convert_dic:main',
            'tdmelodic-sy2a = tdmelodic.nn.convert:main_sy2a',
            'tdmelodic-s2ya = tdmelodic.nn.convert:main_s2ya',
            'tdmelodic-neologd-preprocess = tdmelodic.filters.neologd_preprocess:main',
            'tdmelodic-modify-unigram-cost = tdmelodic.filters.postprocess_modify_unigram_cost:main',
        ]
    },

    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Operating System :: POSIX',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Topic :: Text Processing :: Linguistic',
        'Topic :: Multimedia :: Sound/Audio :: Speech',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Natural Language :: Japanese',
    ]
)
@@ -0,0 +1,11 @@
1
+ from .nn import *
2
+ from .filters import *
3
+
4
+ __copyright__ = 'Copyright (C) 2019 Hideyuki Tachibana, PKSHA Technology Inc.'
5
+ __version__ = '2.0.0'
6
+ __license__ = 'BSD-3-Clause'
7
+ __author__ = 'Hideyuki Tachibana, Zirui Xia'
8
+ __author_email__ = 'xiazr0422@163.com'
9
+ __url__ = 'https://github.com/Na2CuCl4/tdmelodic'
10
+
11
+ __all__ = ['nn', 'filters']
File without changes
@@ -0,0 +1,165 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) 2019-, PKSHA Technology Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ # -*- coding: utf-8 -*-
10
+ import sys
11
+ import os
12
+ import argparse
13
+ import regex as re
14
+ import csv
15
+ from tqdm import tqdm
16
+ import tempfile
17
+ import copy
18
+ import unicodedata
19
+ import jaconv
20
+
21
+ from tdmelodic.util.dic_index_map import get_dictionary_index_map
22
+ from tdmelodic.util.util import count_lines
23
+ from tdmelodic.util.word_type import WordType
24
+
25
+ from .yomi.basic import modify_longvowel_errors
26
+ from .yomi.basic import modify_yomi_of_numerals
27
+ from .yomi.particle_yomi import ParticleYomi
28
+ from .yomi.wrong_yomi_detection import SimpleWrongYomiDetector
29
+
30
class NeologdPatch(object):
    """Filter and correct the rows of a NEologd dictionary CSV.

    Every keyword argument except ``input`` and ``output`` is stored as an
    attribute of the instance.  The methods read the following attributes:
    ``mode``, ``rm_hashtag``, ``rm_noisy_katakana``, ``rm_person``,
    ``rm_emoji``, ``rm_symbol``, ``rm_numeral``, ``rm_wrong_yomi``,
    ``rm_special_particle``, ``cor_longvow``, ``cor_yomi_num`` and
    ``normalize``.  Calling the instance with an input and an output file
    object processes the whole file.
    """

    def __init__(self, *args, **kwargs):
        # Copy the options onto the instance; "input"/"output" are the file
        # objects of the command line and are not options of this class.
        for k, v in kwargs.items():
            if k != "input" and k != "output":
                setattr(self, k, v)
        self.IDX_MAP = get_dictionary_index_map(self.mode)  # dictionary type
        self.wt = WordType(self.mode)
        self.wrong_yomi_detector = SimpleWrongYomiDetector(mode=self.mode)
        self.particle_yomi = ParticleYomi()

    def showinfo(self):
        """Print the state of every option to stderr."""
        print("ℹ️ [ Info ]", file=sys.stderr)
        self.message("| {} Hash tags will{}be removed.", self.rm_hashtag)
        self.message("| {} Noisy katakana words will{}be removed.", self.rm_noisy_katakana)
        self.message("| {} Person names will{}be removed.", self.rm_person)
        self.message("| {} Emojis will{}be removed.", self.rm_emoji)
        self.message("| {} Symbols will{}be removed.", self.rm_symbol)
        self.message("| {} Numerals will{}be removed.", self.rm_numeral)
        self.message("| {} Wrong yomi words will{}be removed.", self.rm_wrong_yomi)
        self.message("| {} Words with special particles \"は\" and \"へ\" will{}be removed", self.rm_special_particle)
        self.message("| {} Long vowel errors will{}be corrected.", self.cor_longvow)
        self.message("| {} Numeral yomi errors will{}be corrected.", self.cor_yomi_num)
        self.message("| {} Surface forms will{}be normalized.", self.normalize)

    @classmethod
    def message(cls, message, flag):
        """Print ``message`` to stderr, filled in according to ``flag``.

        ``message`` must contain two ``{}`` placeholders: the first receives
        a status mark, the second either a blank or `` *NOT* ``.
        """
        if flag:
            message = message.format("✅", " ")
        else:
            message = message.format("‼️", " *NOT* ")
        print(message, file=sys.stderr)

    def add_accent_column(self, line, idx_accent=None):
        """Return ``line`` extended by ten empty fields, with ``'@'`` stored
        at index ``idx_accent``."""
        line = line + ['' for i in range(10)]
        line[idx_accent] = '@'
        return line

    def normalize_surface(self, line, idx_surface=None):
        """Normalize the surface field of ``line`` in place and return it.

        Applies NFKC normalization, upper-casing, jaconv normalization and
        half-width to full-width conversion, in that order.
        """
        s = line[idx_surface]
        s = unicodedata.normalize("NFKC", s)
        s = s.upper()
        s = jaconv.normalize(s, "NFKC")
        s = jaconv.h2z(s, digit=True, ascii=True, kana=True)
        s = s.replace("\u00A5", "\uFFE5")  # yen symbol
        line[idx_surface] = s
        return line

    def process_single_line(self, line):
        """Apply the enabled filters and corrections to one CSV row.

        Args:
            line: the row as a list of field strings.  It is not modified;
                corrections are applied to a deep copy.

        Returns:
            The processed row, or ``None`` when the row is to be removed.
        """
        # ----------------------------------------------------------------------
        # remove words by word types
        if self.rm_hashtag and self.wt.is_hashtag(line):
            return None

        if self.rm_noisy_katakana and self.wt.is_noisy_katakana(line):
            return None

        if self.rm_person and self.wt.is_person(line):
            return None

        if self.rm_emoji and self.wt.is_emoji(line):
            return None

        if self.rm_symbol and self.wt.is_symbol(line):
            return None

        if self.rm_numeral and self.wt.is_numeral(line):
            return None

        # work on a copy so that the caller can compare before and after
        line = copy.deepcopy(line)

        # ----------------------------------------------------------------------
        # correct yomi
        if self.cor_longvow:
            line = modify_longvowel_errors(line, idx_yomi=self.IDX_MAP["YOMI"])

        if self.cor_yomi_num:
            if self.wt.is_numeral(line):
                line = modify_yomi_of_numerals(
                    line,
                    idx_surface=self.IDX_MAP["SURFACE"],
                    idx_yomi=self.IDX_MAP["YOMI"])

        # ----------------------------------------------------------------------
        # fix the yomi of particles (TODO)
        if self.rm_special_particle:
            line = self.particle_yomi(line, self.IDX_MAP)
            if line is None:
                return None

        # ----------------------------------------------------------------------
        # normalize surface
        if self.normalize:
            line = self.normalize_surface(line, idx_surface=self.IDX_MAP["SURFACE"])

        # ----------------------------------------------------------------------
        # remove words with their yomi
        if self.rm_wrong_yomi:
            line = self.wrong_yomi_detector(line)
            if line is None:
                return None

        # ----------------------------------------------------------------------
        # add additional columns for compatibility with unidic-kana-accent
        if self.mode == "unidic":
            line = self.add_accent_column(line, idx_accent=self.IDX_MAP["ACCENT"])

        # ----------------------------------------------------------------------
        return line

    def __call__(self, fp_in, fp_out):
        """Process every row of ``fp_in`` and write the kept rows to ``fp_out``.

        Progress information and statistics are printed to stderr.  If a row
        cannot be processed, the error and the row are reported on stderr and
        the process exits with status 1.
        """
        self.showinfo()
        L = count_lines(fp_in)
        n_removed = 0
        n_corrected = 0
        # Rows are parsed by csv.reader, so they must be written back by
        # csv.writer: a plain ','.join() would drop the quoting of fields
        # that contain commas or quotes and corrupt those rows.
        writer = csv.writer(fp_out, lineterminator='\n')
        for line in tqdm(csv.reader(fp_in), total=L):
            try:
                line_processed = self.process_single_line(line)
            except Exception as e:
                # Diagnostics go to stderr; stdout may be the data output.
                print(e, file=sys.stderr)
                print(line, file=sys.stderr)
                sys.exit(1)
            if line_processed is None:
                n_removed += 1
                continue
            if line_processed[:20] != line[:20]:
                n_corrected += 1
            writer.writerow(line_processed)

        print("🍺 [ Complete! ]", file=sys.stderr)
        print("📊 Number of removed entries ", n_removed, file=sys.stderr)
        print("📊 Number of corrected entries ", n_corrected, file=sys.stderr)
        return
@@ -0,0 +1,127 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) 2019-, PKSHA Technology Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ # -*- coding: utf-8 -*-
10
+ import sys
11
+ import os
12
+ import argparse
13
+ import tempfile
14
+
15
+ from .neologd_patch import NeologdPatch
16
+ from .neologd_rmdups import rmdups
17
+
18
class Preprocess(object):
    """Run the NEologd preprocessing pipeline on a dictionary CSV.

    The pipeline optionally removes duplicate entries, then applies the
    given patch callable, and finally copies the result to the output file.
    Intermediate results are kept in temporary files.

    Args:
        flag_rmdups: whether duplicate entries are removed first.
        neologd_patch: callable invoked as ``neologd_patch(fp_in, fp_out)``.
        dictionary_type: dictionary format name passed on to ``rmdups``.
    """

    def __init__(self, flag_rmdups, neologd_patch, dictionary_type="unidic"):
        self.flag_rmdups = flag_rmdups
        self.neologd_patch_module = neologd_patch
        self.dictionary_type = dictionary_type

    def _run_stage(self, stage, fp_in):
        """Run ``stage(fp_in, fp_tmp)`` into a fresh temporary file.

        Returns the temporary file rewound to its beginning; ``fp_in`` is
        closed once the stage has succeeded.
        """
        # Use an explicit encoding so that the intermediate file can hold any
        # text regardless of the locale's default encoding.
        fp_tmp = tempfile.NamedTemporaryFile("w+", encoding="utf-8")
        print("📌 creating a temporary file", fp_tmp.name, file=sys.stderr)
        try:
            stage(fp_in, fp_tmp)
        except BaseException:
            # do not leave the temporary file behind when the stage fails
            fp_tmp.close()
            raise
        fp_tmp.seek(0)
        fp_in.close()  # the input of this stage is no longer needed
        return fp_tmp

    def do_rmdups(self, fp_in):
        """Remove duplicate entries; return a file object with the result."""
        return self._run_stage(
            lambda src, dst: rmdups(src, dst, self.dictionary_type), fp_in)

    def do_neologd_patch(self, fp_in):
        """Apply the patch callable; return a file object with the result."""
        return self._run_stage(self.neologd_patch_module, fp_in)

    def copy_temp_to_output(self, fp_in, fp_out):
        """Copy ``fp_in`` line by line to ``fp_out`` and close both."""
        # output
        for l in fp_in:
            fp_out.write(l)
        fp_in.close()
        fp_out.close()

    def __call__(self, fp_in, fp_out):
        """Process ``fp_in`` and write the final dictionary to ``fp_out``.

        Both file objects are closed when the call returns normally.
        """
        print("ℹ️ [ Info ]", file=sys.stderr)
        NeologdPatch.message("| {} Duplicate entries will{}be removed.", self.flag_rmdups)
        if self.flag_rmdups:
            fp_in = self.do_rmdups(fp_in)

        fp_in = self.do_neologd_patch(fp_in)

        print("💾 [ Saving ]", file=sys.stderr)
        self.copy_temp_to_output(fp_in, fp_out)
        print("🍺 [ Done ]", file=sys.stderr)
60
+
61
def my_add_argument(parser, option_name, default, help_):
    """Register an on/off command-line switch on ``parser``.

    The switch can be given as ``--<option_name>`` or ``--no-<option_name>``
    and its value is stored under ``option_name``.  The help text is
    extended with the default value.

    Args:
        parser: the ``argparse`` parser to add the switch to.
        option_name: name of the option without leading dashes.
        default: value used when neither form is given.
        help_: help text of the option.
    """
    positive_flag = "--" + option_name
    help_text = help_ + " <default=" + str(default) + ">"

    if sys.version_info < (3, 9):
        # argparse.BooleanOptionalAction is only used on Python 3.9 and
        # later; otherwise register the two forms by hand.
        parser.add_argument(positive_flag,
                            action="store_true",
                            default=default,
                            help=help_text)
        parser.add_argument("--no-" + option_name,
                            action="store_false",
                            dest=option_name,
                            default=default)
        return

    parser.add_argument(positive_flag,
                        action=argparse.BooleanOptionalAction,
                        default=default,
                        help=help_text)
77
+
78
def _is_same_file(fp_a, fp_b):
    """Return True when the two file objects refer to the same file."""
    if fp_a is fp_b:
        return True
    name_a = getattr(fp_a, "name", None)
    name_b = getattr(fp_b, "name", None)
    if not isinstance(name_a, str) or not isinstance(name_b, str):
        return False
    try:
        return os.path.samefile(name_a, name_b)
    except OSError:
        # e.g. the standard streams, whose names are not real paths
        return False


def main():
    """Command-line entry point of the NEologd preprocessing tool.

    Parses the command-line options, builds the preprocessing pipeline and
    runs it on the input file.  Errors are reported on stderr and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i', '--input',
        nargs='?',
        type=argparse.FileType("r"),
        default=sys.stdin,
        help='input CSV file (NEologd dictionary file) <default=STDIN>')
    parser.add_argument(
        '-o', '--output',
        nargs='?',
        type=argparse.FileType("w"),
        default=sys.stdout,
        help='output CSV file <default=STDOUT>')
    parser.add_argument(
        "-m", "--mode",
        type=str,
        choices=["unidic", "ipadic"],
        default="unidic",
        help="dictionary format type <default=unidic>",
    )
    my_add_argument(parser, "rmdups", True, "remove duplicate entries or not")
    my_add_argument(parser, "rm_hashtag", True, "remove hash tags or not")
    my_add_argument(parser, "rm_noisy_katakana", True, "remove noisy katakana words or not")
    my_add_argument(parser, "rm_person", False, "remove person names or not")
    my_add_argument(parser, "rm_emoji", False, "remove emojis or not")
    my_add_argument(parser, "rm_symbol", False, "remove symbols or not")
    my_add_argument(parser, "rm_numeral", False, "remove numerals or not")
    my_add_argument(parser, "rm_wrong_yomi", True, "remove words with possibly wrong yomi or not")
    my_add_argument(parser, "rm_special_particle", True, "remove words with special particles \"は\" or \"へ\"")
    my_add_argument(parser, "cor_longvow", True, "correct long vowel errors or not")
    my_add_argument(parser, "cor_yomi_num", True, "correct the yomi of numerals or not")
    my_add_argument(parser, "normalize", False, "normalize the surface forms by applying "
                                                "NFKC Unicode normalization, "
                                                "capitalization of alphabets, "
                                                "and "
                                                "hankaku-to-zenkaku converter.")

    args = parser.parse_args()

    # argparse opens the input and the output as two separate file objects,
    # so comparing the objects themselves never detects "-i x -o x"; compare
    # the files they refer to instead.  NOTE: the output has already been
    # opened for writing (and truncated) by argparse at this point.
    if _is_same_file(args.input, args.output):
        print("[ Error ] input and output files should be different.", file=sys.stderr)
        sys.exit(1)

    try:
        preprocess = Preprocess(args.rmdups, NeologdPatch(**vars(args)), dictionary_type=args.mode)
        preprocess(args.input, args.output)
    except Exception as e:
        # Report the failure and signal it through the exit status instead
        # of terminating as if the run had succeeded.
        print(e, file=sys.stderr)
        sys.exit(1)
126
+ if __name__ == '__main__':
127
+ main()
@@ -0,0 +1,94 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) 2019-, PKSHA Technology Inc.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ # -----------------------------------------------------------------------------
8
+ # -*- coding: utf-8 -*-
9
+ import sys
10
+ import os
11
+ import argparse
12
+ import regex as re
13
+ import csv
14
+ from tqdm import tqdm
15
+
16
+ import jaconv
17
+ import unicodedata
18
+ from dataclasses import dataclass
19
+
20
+ from tdmelodic.nn.lang.japanese.kansuji import numeric2kanji
21
+ from tdmelodic.util.dic_index_map import get_dictionary_index_map
22
+ from tdmelodic.util.util import count_lines
23
+ from tdmelodic.util.word_type import WordType
24
+ from .yomi.yomieval import YomiEvaluator
25
+
26
+ # ------------------------------------------------------------------------------------
27
def normalize_surface(text):
    """Return a normalized form of a surface string, used for comparison.

    The text is NFKC-normalized, converted from half-width to full-width
    (digits and ASCII, not kana), passed through ``numeric2kanji``, and the
    abbreviations for company types and the ampersand are replaced by
    katakana readings.
    """
    # hankaku
    text = unicodedata.normalize("NFKC", text)
    text = jaconv.h2z(text, digit=True, ascii=True, kana=False)

    # kansuji
    text = numeric2kanji(text)

    # "(kabu)" / "(yuu)" company abbreviations and the ampersand.
    # The text has been through h2z(ascii=True) above, so parentheses and
    # ampersands are presumably full-width by now (confirm against jaconv);
    # match the full-width spellings as well as the half-width ones.
    replacements = (
        ("(株)", "・カブシキガイシャ・"),
        ("\uFF08株\uFF09", "・カブシキガイシャ・"),
        ("(有)", "・ユウゲンガイシャ・"),
        ("\uFF08有\uFF09", "・ユウゲンガイシャ・"),
        ("&", "・アンド・"),
        ("\uFF06", "・アンド・"),
    )
    for abbreviation, reading in replacements:
        text = text.replace(abbreviation, reading)
    return text
40
+
41
+ # ------------------------------------------------------------------------------------
42
@dataclass
class LineInfo:
    """The fields of one dictionary row that duplicate detection compares."""

    # surface form
    surf: str
    # reading (yomi)
    yomi: str
    # part-of-speech tag
    pos: str
47
+
48
def get_line_info(line, IDX_MAP):
    """Build a ``LineInfo`` from one CSV row.

    Args:
        line: the row as a list of field strings.
        IDX_MAP: mapping from the column names ``SURFACE``, ``YOMI``,
            ``POS1``, ``POS2`` and ``POS3`` to column indices.

    Returns:
        A ``LineInfo`` holding the normalized surface, the yomi and the
        three part-of-speech fields joined by ``-``.
    """
    raw_surface = line[IDX_MAP["SURFACE"]]
    yomi = line[IDX_MAP["YOMI"]]
    pos_columns = [IDX_MAP["POS1"], IDX_MAP["POS2"], IDX_MAP["POS3"]]
    pos_tag = "-".join([line[column] for column in pos_columns])
    return LineInfo(normalize_surface(raw_surface), yomi, pos_tag)
55
+
56
def rmdups(fp_in, fp_out, dictionary_type="unidic"):
    """Remove duplicate entries from a dictionary CSV.

    Consecutive rows are compared.  When two neighbouring rows have the same
    normalized surface and the same part of speech, and the earlier row is
    neither a person name nor a place name, the earlier row is dropped if its
    yomi scores strictly worse (a larger ``YomiEvaluator.eval`` value) than
    the later one's.  All other rows are written to ``fp_out``.

    Args:
        fp_in: input file object (CSV).
        fp_out: output file object.
        dictionary_type: unidic or ipadic
    """
    IDX_MAP = get_dictionary_index_map(dictionary_type)

    yomieval = YomiEvaluator()
    n_removed = 0
    L = count_lines(fp_in)
    wt = WordType(dictionary_type)
    # Rows come from csv.reader, so write them with csv.writer to keep the
    # quoting of fields that contain commas or quotes.
    writer = csv.writer(fp_out, lineterminator="\n")

    # The row waiting to be written and its parsed info.  ``None`` means no
    # row has been read yet, so nothing bogus is emitted for an empty input.
    prev_line = None
    prev = None

    print("ℹ️ [ Removing duplicate entries ]", file=sys.stderr)
    for curr_line in tqdm(csv.reader(fp_in), total=L):
        curr = get_line_info(curr_line, IDX_MAP)

        if prev_line is not None:
            is_candidate = (
                prev.surf == curr.surf and prev.pos == curr.pos
                and not wt.is_person(prev_line)
                and not wt.is_placename(prev_line)
            )
            if is_candidate and (yomieval.eval(prev.surf, prev.yomi)
                                 > yomieval.eval(curr.surf, curr.yomi)):
                # the previous row is the worse duplicate: drop it
                n_removed += 1
            else:
                writer.writerow(prev_line)

        # reuse the parsed info instead of normalizing the row again
        prev_line, prev = curr_line, curr

    # the last row has no successor to be compared with: always keep it
    if prev_line is not None:
        writer.writerow(prev_line)
    print("📊 Number of removed duplicate entries ", n_removed, file=sys.stderr)