handic 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
handic-0.1.0/LICENSE ADDED
@@ -0,0 +1,41 @@
1
+ MIT License
2
+
3
+ Copyright 2020 Paul McCann
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
9
+ of the Software, and to permit persons to whom the Software is furnished to do
10
+ so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ Copyright 2024 Yoshinori Sugai
24
+
25
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
26
+ this software and associated documentation files (the "Software"), to deal in
27
+ the Software without restriction, including without limitation the rights to
28
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
29
+ of the Software, and to permit persons to whom the Software is furnished to do
30
+ so, subject to the following conditions:
31
+
32
+ The above copyright notice and this permission notice shall be included in all
33
+ copies or substantial portions of the Software.
34
+
35
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
37
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
38
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
39
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
40
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41
+ SOFTWARE.
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2011-, Yoshinori Sugai
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1 @@
1
+ include handic/dicdir/*
handic-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.1
2
+ Name: handic
3
+ Version: 0.1.0
4
+ Summary: HanDic package for installing via pip.
5
+ Author: Yoshinori Sugai
6
+ Author-email: okikirmui+github@gmail.com
7
+ License: MIT License
8
+ Project-URL: Repository, https://github.com/okikirmui/handic-py
9
+ Keywords: handic,MeCab,Korean Language,morphological analysis,morphological analysis dictionary,korean text processing
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Natural Language :: Korean
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Text Processing
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ License-File: LICENSE.handic
19
+
20
+ # handic-py
21
+
22
+ This is a package to install [HanDic](https://github.com/okikirmui/handic), a dictionary for morphological analysis of the Korean language, via pip and use it in Python.
23
+
24
+ To use this package for morphological analysis, a MeCab wrapper such as [mecab-python3](https://github.com/SamuraiT/mecab-python3) is required.
25
+
26
+ ## Installation
27
+
28
+ from PyPI:
29
+
30
+ ```Shell
31
+ pip install handic
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ Since HanDic requires Hangul Jamo (Unicode Hangul Jamo) as input, please convert Hangul (Unicode Hangul Syllables) using a module such as [jamotools](https://pypi.org/project/jamotools/) or the `tools/k2jamo.py` script included in HanDic.
37
+
38
+ ### basic
39
+
40
+ example:
41
+
42
+ ```Python
43
+ import MeCab
44
+ import handic
45
+ import jamotools
46
+
47
+ mecaboption = f'-r /dev/null -d {handic.DICDIR}'
48
+
49
+ tokenizer = MeCab.Tagger(mecaboption)
50
+ tokenizer.parse('')
51
+
52
+ # 《표준국어대사전》 "형태소" 뜻풀이
53
+ sentence = u'뜻을 가진 가장 작은 말의 단위. ‘이야기책’의 ‘이야기’, ‘책’ 따위이다.'
54
+
55
+ jamo = jamotools.split_syllables(sentence, jamo_type="JAMO")
56
+
57
+ node = tokenizer.parseToNode(jamo)
58
+ while node:
59
+ print(node.surface, node.feature)
60
+ node = node.next
61
+ ```
62
+
63
+ result:
64
+
65
+ ```Shell
66
+ BOS/EOS,*,*,*,*,*,*,*,*,*,*
67
+ 뜻 Noun,普通,*,*,*,뜻,뜻,*,*,B,NNG
68
+ 을 Ending,助詞,対格,*,*,을02,을,*,*,*,JKO
69
+ 가지 Verb,自立,*,語基2,*,가지다,가지,*,*,A,VV
70
+ ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
71
+ 가장 Adverb,一般,*,*,*,가장01,가장,*,*,A,MAG
72
+ 작으 Adjective,自立,*,語基2,*,작다01,작으,*,*,A,VA
73
+ ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
74
+ 말 Noun,普通,動作,*,*,말01,말,*,*,A,NNG
75
+ 의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
76
+ 단위 Noun,普通,*,*,*,단위02,단위,單位,*,C,NNG
77
+ . Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
78
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
79
+ 이야기책 Noun,普通,*,*,*,이야기책,이야기책,이야기冊,*,*,NNG
80
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
81
+ 의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
82
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
83
+ 이야기 Noun,普通,動作,*,*,이야기,이야기,*,*,A,NNG
84
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
85
+ , Symbol,コンマ,*,*,*,",",",",*,*,*,SP
86
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
87
+ 책 Noun,普通,*,*,*,책01,책,冊,*,A,NNG
88
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
89
+ 따위 Noun,依存名詞,*,*,*,따위,따위,*,*,*,NNB
90
+ 이 Siteisi,非自立,*,語基1,*,이다,이,*,*,*,VCP
91
+ 다 Ending,語尾,終止形,*,1接続,다06,다,*,*,*,EF
92
+ . Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
93
+ BOS/EOS,*,*,*,*,*,*,*,*,*,*
94
+ ```
95
+
96
+ ### Tokenize
97
+
98
+ example:
99
+
100
+ ```Python
101
+ mecaboption = f'-r /dev/null -d {handic.DICDIR} -Otokenize'
102
+ tokenizer = MeCab.Tagger(mecaboption)
103
+
104
+ print(tokenizer.parse(jamo))
105
+ ```
106
+
107
+ result:
108
+
109
+ ```Shell
110
+ 뜻 을 가지 ㄴ 가장 작으 ㄴ 말 의 단위 . ‘ 이야기책 ’ 의 ‘ 이야기 ’ , ‘ 책 ’ 따위 이 다 .
111
+ ```
112
+
113
+ ### Extracting specific POS
114
+
115
+ example:
116
+
117
+ ```Python
118
+ mecaboption = f'-r /dev/null -d {handic.DICDIR}'
119
+
120
+ tokenizer = MeCab.Tagger(mecaboption)
121
+ tokenizer.parse('')
122
+
123
+ node = tokenizer.parseToNode(jamo)
124
+ while node:
125
+ # 일반명사(pos-tag: NNG)만 추출
126
+ if node.feature.split(',')[10] in ['NNG']:
127
+ print(node.feature.split(',')[5])
128
+ node = node.next
129
+ ```
130
+
131
+ result:
132
+
133
+ ```Shell
134
+
135
+ 말01
136
+ 단위02
137
+ 이야기책
138
+ 이야기
139
+ 책01
140
+ ```
141
+
142
+ ## Features
143
+
144
+ Here is the list of features included in HanDic. For more information, see the [HanDic 품사 정보](https://github.com/okikirmui/handic/blob/main/docs/pos_detail.md).
145
+
146
+ - 품사1, 품사2, 품사3: part of speech(index: 0-2)
147
+ - 활용형: conjugation "base"(ex. `語基1`, `語基2`, `語基3`)(index: 3)
148
+ - 접속 정보: which "base" the ending is attached to(ex. `1接続`, `2接続`, etc.)(index: 4)
149
+ - 사전 항목: base forms(index: 5)
150
+ - 표층형: surface(index: 6)
151
+ - 한자: for sino-words(index: 7)
152
+ - 보충 정보: miscellaneous information(index: 8)
153
+ - 학습 수준: learning level(index: 9)
154
+ - 세종계획 품사 태그: pos-tag(index: 10)
155
+
156
+ ## License
157
+
158
+ This code is licensed under the MIT license. HanDic is copyright Yoshinori Sugai and distributed under the [BSD license](./LICENSE.handic).
159
+
160
+ ## Acknowledgment
161
+
162
+ This repository is forked from [unidic-lite](https://github.com/polm/unidic-lite) with some modifications and file additions and deletions.
handic-0.1.0/README.md ADDED
@@ -0,0 +1,143 @@
1
+ # handic-py
2
+
3
+ This is a package to install [HanDic](https://github.com/okikirmui/handic), a dictionary for morphological analysis of the Korean language, via pip and use it in Python.
4
+
5
+ To use this package for morphological analysis, a MeCab wrapper such as [mecab-python3](https://github.com/SamuraiT/mecab-python3) is required.
6
+
7
+ ## Installation
8
+
9
+ from PyPI:
10
+
11
+ ```Shell
12
+ pip install handic
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ Since HanDic requires Hangul Jamo (Unicode Hangul Jamo) as input, please convert Hangul (Unicode Hangul Syllables) using a module such as [jamotools](https://pypi.org/project/jamotools/) or the `tools/k2jamo.py` script included in HanDic.
18
+
19
+ ### basic
20
+
21
+ example:
22
+
23
+ ```Python
24
+ import MeCab
25
+ import handic
26
+ import jamotools
27
+
28
+ mecaboption = f'-r /dev/null -d {handic.DICDIR}'
29
+
30
+ tokenizer = MeCab.Tagger(mecaboption)
31
+ tokenizer.parse('')
32
+
33
+ # 《표준국어대사전》 "형태소" 뜻풀이
34
+ sentence = u'뜻을 가진 가장 작은 말의 단위. ‘이야기책’의 ‘이야기’, ‘책’ 따위이다.'
35
+
36
+ jamo = jamotools.split_syllables(sentence, jamo_type="JAMO")
37
+
38
+ node = tokenizer.parseToNode(jamo)
39
+ while node:
40
+ print(node.surface, node.feature)
41
+ node = node.next
42
+ ```
43
+
44
+ result:
45
+
46
+ ```Shell
47
+ BOS/EOS,*,*,*,*,*,*,*,*,*,*
48
+ 뜻 Noun,普通,*,*,*,뜻,뜻,*,*,B,NNG
49
+ 을 Ending,助詞,対格,*,*,을02,을,*,*,*,JKO
50
+ 가지 Verb,自立,*,語基2,*,가지다,가지,*,*,A,VV
51
+ ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
52
+ 가장 Adverb,一般,*,*,*,가장01,가장,*,*,A,MAG
53
+ 작으 Adjective,自立,*,語基2,*,작다01,작으,*,*,A,VA
54
+ ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
55
+ 말 Noun,普通,動作,*,*,말01,말,*,*,A,NNG
56
+ 의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
57
+ 단위 Noun,普通,*,*,*,단위02,단위,單位,*,C,NNG
58
+ . Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
59
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
60
+ 이야기책 Noun,普通,*,*,*,이야기책,이야기책,이야기冊,*,*,NNG
61
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
62
+ 의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
63
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
64
+ 이야기 Noun,普通,動作,*,*,이야기,이야기,*,*,A,NNG
65
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
66
+ , Symbol,コンマ,*,*,*,",",",",*,*,*,SP
67
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
68
+ 책 Noun,普通,*,*,*,책01,책,冊,*,A,NNG
69
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
70
+ 따위 Noun,依存名詞,*,*,*,따위,따위,*,*,*,NNB
71
+ 이 Siteisi,非自立,*,語基1,*,이다,이,*,*,*,VCP
72
+ 다 Ending,語尾,終止形,*,1接続,다06,다,*,*,*,EF
73
+ . Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
74
+ BOS/EOS,*,*,*,*,*,*,*,*,*,*
75
+ ```
76
+
77
+ ### Tokenize
78
+
79
+ example:
80
+
81
+ ```Python
82
+ mecaboption = f'-r /dev/null -d {handic.DICDIR} -Otokenize'
83
+ tokenizer = MeCab.Tagger(mecaboption)
84
+
85
+ print(tokenizer.parse(jamo))
86
+ ```
87
+
88
+ result:
89
+
90
+ ```Shell
91
+ 뜻 을 가지 ㄴ 가장 작으 ㄴ 말 의 단위 . ‘ 이야기책 ’ 의 ‘ 이야기 ’ , ‘ 책 ’ 따위 이 다 .
92
+ ```
93
+
94
+ ### Extracting specific POS
95
+
96
+ example:
97
+
98
+ ```Python
99
+ mecaboption = f'-r /dev/null -d {handic.DICDIR}'
100
+
101
+ tokenizer = MeCab.Tagger(mecaboption)
102
+ tokenizer.parse('')
103
+
104
+ node = tokenizer.parseToNode(jamo)
105
+ while node:
106
+ # 일반명사(pos-tag: NNG)만 추출
107
+ if node.feature.split(',')[10] in ['NNG']:
108
+ print(node.feature.split(',')[5])
109
+ node = node.next
110
+ ```
111
+
112
+ result:
113
+
114
+ ```Shell
115
+
116
+ 말01
117
+ 단위02
118
+ 이야기책
119
+ 이야기
120
+ 책01
121
+ ```
122
+
123
+ ## Features
124
+
125
+ Here is the list of features included in HanDic. For more information, see the [HanDic 품사 정보](https://github.com/okikirmui/handic/blob/main/docs/pos_detail.md).
126
+
127
+ - 품사1, 품사2, 품사3: part of speech(index: 0-2)
128
+ - 활용형: conjugation "base"(ex. `語基1`, `語基2`, `語基3`)(index: 3)
129
+ - 접속 정보: which "base" the ending is attached to(ex. `1接続`, `2接続`, etc.)(index: 4)
130
+ - 사전 항목: base forms(index: 5)
131
+ - 표층형: surface(index: 6)
132
+ - 한자: for sino-words(index: 7)
133
+ - 보충 정보: miscellaneous information(index: 8)
134
+ - 학습 수준: learning level(index: 9)
135
+ - 세종계획 품사 태그: pos-tag(index: 10)
136
+
137
+ ## License
138
+
139
+ This code is licensed under the MIT license. HanDic is copyright Yoshinori Sugai and distributed under the [BSD license](./LICENSE.handic).
140
+
141
+ ## Acknowledgment
142
+
143
+ This repository is forked from [unidic-lite](https://github.com/polm/unidic-lite) with some modifications and file additions and deletions.
@@ -0,0 +1 @@
1
+ from .handic import DICDIR, VERSION
@@ -0,0 +1,7 @@
1
+ # HanDic
2
+
3
+ HanDic is a dictionary for morphological analysis of the Korean language with the morphological analysis engine MeCab. It consists of over 120,000 entries and was trained and built with data centered on written language, such as newspapers, news, novels, and textbooks.
4
+
5
+ For more information, please refer to [HanDic](https://github.com/okikirmui/handic).
6
+
7
+ HanDic is copyright Yoshinori Sugai and distributed under the BSD-3-Clause license.
Binary file
@@ -0,0 +1,15 @@
1
+ cost-factor = 800
2
+ bos-feature = BOS/EOS,*,*,*,*,*,*,*,*,*,*
3
+ eval-size = 5
4
+ unk-eval-size = 3
5
+ config-charset = utf8
6
+
7
+ ; ChaSen for KH Coder
8
+ node-format-chasen = %f[6]\t%M\t%f[5]\t%F-[0,1,2]\t%f[3]\t%f[4]\n
9
+ unk-format-chasen = %f[6]\t%M\t%f[6]\t%F-[0,1,2]\t\t\n
10
+ eos-format-chasen = EOS\n
11
+
12
+ ; Tokenize option for Korean
13
+ node-format-tokenize = %f[6]\s
14
+ unk-format-tokenize = %m\s
15
+ eos-format-tokenize = \n
Binary file
@@ -0,0 +1,2 @@
1
+ # This is a dummy file
2
+ # It has to exist, but it can be empty
Binary file
Binary file
@@ -0,0 +1 @@
1
+ v20241027
@@ -0,0 +1,13 @@
1
+ import os
2
+ import sys
3
+
4
+ def get_version(dicdir):  # Return the version string stored in the 'version' file inside dicdir.
5
+ vpath = os.path.join(dicdir, 'version')  # path of the 'version' file directly under dicdir
6
+ with open(vpath) as vfile:  # text mode, platform default encoding; raises if the file cannot be opened
7
+ return vfile.read().strip()  # entire file content with surrounding whitespace removed
8
+
9
+ _curdir = os.path.dirname(__file__)  # directory that contains this module
10
+
11
+ # DICDIR is the dictionary directory callers pass to the tagger (the README uses it as MeCab's -d argument)
12
+ DICDIR = os.path.join(_curdir, 'dicdir')  # 'dicdir' subdirectory next to this module
13
+ VERSION = get_version(DICDIR)  # read once at import time from DICDIR/version; import fails if that file cannot be opened
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.1
2
+ Name: handic
3
+ Version: 0.1.0
4
+ Summary: HanDic package for installing via pip.
5
+ Author: Yoshinori Sugai
6
+ Author-email: okikirmui+github@gmail.com
7
+ License: MIT License
8
+ Project-URL: Repository, https://github.com/okikirmui/handic-py
9
+ Keywords: handic,MeCab,Korean Language,morphological analysis,morphological analysis dictionary,korean text processing
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Natural Language :: Korean
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Text Processing
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ License-File: LICENSE.handic
19
+
20
+ # handic-py
21
+
22
+ This is a package to install [HanDic](https://github.com/okikirmui/handic), a dictionary for morphological analysis of the Korean language, via pip and use it in Python.
23
+
24
+ To use this package for morphological analysis, a MeCab wrapper such as [mecab-python3](https://github.com/SamuraiT/mecab-python3) is required.
25
+
26
+ ## Installation
27
+
28
+ from PyPI:
29
+
30
+ ```Shell
31
+ pip install handic
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ Since HanDic requires Hangul Jamo (Unicode Hangul Jamo) as input, please convert Hangul (Unicode Hangul Syllables) using a module such as [jamotools](https://pypi.org/project/jamotools/) or the `tools/k2jamo.py` script included in HanDic.
37
+
38
+ ### basic
39
+
40
+ example:
41
+
42
+ ```Python
43
+ import MeCab
44
+ import handic
45
+ import jamotools
46
+
47
+ mecaboption = f'-r /dev/null -d {handic.DICDIR}'
48
+
49
+ tokenizer = MeCab.Tagger(mecaboption)
50
+ tokenizer.parse('')
51
+
52
+ # 《표준국어대사전》 "형태소" 뜻풀이
53
+ sentence = u'뜻을 가진 가장 작은 말의 단위. ‘이야기책’의 ‘이야기’, ‘책’ 따위이다.'
54
+
55
+ jamo = jamotools.split_syllables(sentence, jamo_type="JAMO")
56
+
57
+ node = tokenizer.parseToNode(jamo)
58
+ while node:
59
+ print(node.surface, node.feature)
60
+ node = node.next
61
+ ```
62
+
63
+ result:
64
+
65
+ ```Shell
66
+ BOS/EOS,*,*,*,*,*,*,*,*,*,*
67
+ 뜻 Noun,普通,*,*,*,뜻,뜻,*,*,B,NNG
68
+ 을 Ending,助詞,対格,*,*,을02,을,*,*,*,JKO
69
+ 가지 Verb,自立,*,語基2,*,가지다,가지,*,*,A,VV
70
+ ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
71
+ 가장 Adverb,一般,*,*,*,가장01,가장,*,*,A,MAG
72
+ 작으 Adjective,自立,*,語基2,*,작다01,작으,*,*,A,VA
73
+ ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
74
+ 말 Noun,普通,動作,*,*,말01,말,*,*,A,NNG
75
+ 의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
76
+ 단위 Noun,普通,*,*,*,단위02,단위,單位,*,C,NNG
77
+ . Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
78
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
79
+ 이야기책 Noun,普通,*,*,*,이야기책,이야기책,이야기冊,*,*,NNG
80
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
81
+ 의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
82
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
83
+ 이야기 Noun,普通,動作,*,*,이야기,이야기,*,*,A,NNG
84
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
85
+ , Symbol,コンマ,*,*,*,",",",",*,*,*,SP
86
+ ‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
87
+ 책 Noun,普通,*,*,*,책01,책,冊,*,A,NNG
88
+ ’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
89
+ 따위 Noun,依存名詞,*,*,*,따위,따위,*,*,*,NNB
90
+ 이 Siteisi,非自立,*,語基1,*,이다,이,*,*,*,VCP
91
+ 다 Ending,語尾,終止形,*,1接続,다06,다,*,*,*,EF
92
+ . Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
93
+ BOS/EOS,*,*,*,*,*,*,*,*,*,*
94
+ ```
95
+
96
+ ### Tokenize
97
+
98
+ example:
99
+
100
+ ```Python
101
+ mecaboption = f'-r /dev/null -d {handic.DICDIR} -Otokenize'
102
+ tokenizer = MeCab.Tagger(mecaboption)
103
+
104
+ print(tokenizer.parse(jamo))
105
+ ```
106
+
107
+ result:
108
+
109
+ ```Shell
110
+ 뜻 을 가지 ㄴ 가장 작으 ㄴ 말 의 단위 . ‘ 이야기책 ’ 의 ‘ 이야기 ’ , ‘ 책 ’ 따위 이 다 .
111
+ ```
112
+
113
+ ### Extracting specific POS
114
+
115
+ example:
116
+
117
+ ```Python
118
+ mecaboption = f'-r /dev/null -d {handic.DICDIR}'
119
+
120
+ tokenizer = MeCab.Tagger(mecaboption)
121
+ tokenizer.parse('')
122
+
123
+ node = tokenizer.parseToNode(jamo)
124
+ while node:
125
+ # 일반명사(pos-tag: NNG)만 추출
126
+ if node.feature.split(',')[10] in ['NNG']:
127
+ print(node.feature.split(',')[5])
128
+ node = node.next
129
+ ```
130
+
131
+ result:
132
+
133
+ ```Shell
134
+
135
+ 말01
136
+ 단위02
137
+ 이야기책
138
+ 이야기
139
+ 책01
140
+ ```
141
+
142
+ ## Features
143
+
144
+ Here is the list of features included in HanDic. For more information, see the [HanDic 품사 정보](https://github.com/okikirmui/handic/blob/main/docs/pos_detail.md).
145
+
146
+ - 품사1, 품사2, 품사3: part of speech(index: 0-2)
147
+ - 활용형: conjugation "base"(ex. `語基1`, `語基2`, `語基3`)(index: 3)
148
+ - 접속 정보: which "base" the ending is attached to(ex. `1接続`, `2接続`, etc.)(index: 4)
149
+ - 사전 항목: base forms(index: 5)
150
+ - 표층형: surface(index: 6)
151
+ - 한자: for sino-words(index: 7)
152
+ - 보충 정보: miscellaneous information(index: 8)
153
+ - 학습 수준: learning level(index: 9)
154
+ - 세종계획 품사 태그: pos-tag(index: 10)
155
+
156
+ ## License
157
+
158
+ This code is licensed under the MIT license. HanDic is copyright Yoshinori Sugai and distributed under the [BSD license](./LICENSE.handic).
159
+
160
+ ## Acknowledgment
161
+
162
+ This repository is forked from [unidic-lite](https://github.com/polm/unidic-lite) with some modifications and file additions and deletions.
@@ -0,0 +1,19 @@
1
+ LICENSE
2
+ LICENSE.handic
3
+ MANIFEST.in
4
+ README.md
5
+ pyproject.toml
6
+ handic/__init__.py
7
+ handic/handic.py
8
+ handic.egg-info/PKG-INFO
9
+ handic.egg-info/SOURCES.txt
10
+ handic.egg-info/dependency_links.txt
11
+ handic.egg-info/top_level.txt
12
+ handic/dicdir/README.md
13
+ handic/dicdir/char.bin
14
+ handic/dicdir/dicrc
15
+ handic/dicdir/matrix.bin
16
+ handic/dicdir/mecabrc
17
+ handic/dicdir/sys.dic
18
+ handic/dicdir/unk.dic
19
+ handic/dicdir/version
@@ -0,0 +1 @@
1
+ handic
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools >= 61.0"]  # NOTE(review): 61.0 is presumably the floor because the [project] table below needs it — confirm
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "handic"
7
+ version = "0.1.0"  # version of this Python package; the bundled dictionary has its own version string in handic/dicdir/version
8
+ requires-python = ">= 3.8"
9
+ authors = [  # NOTE(review): name and email are declared as two separate author entries; a single table may hold both keys — confirm intent
10
+ {name="Yoshinori Sugai"},
11
+ {email="okikirmui+github@gmail.com"}
12
+ ]
13
+ description = "HanDic package for installing via pip."
14
+ readme = "README.md"
15
+ license = {text = "MIT License"}  # applies to the packaging code; the README states the HanDic dictionary itself is BSD-licensed (LICENSE.handic)
16
+ keywords = [
17
+ "handic", "MeCab", "Korean Language", "morphological analysis", "morphological analysis dictionary", "korean text processing"
18
+ ]
19
+ classifiers = [
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python",
22
+ "Natural Language :: Korean",
23
+ "Operating System :: OS Independent",
24
+ "Topic :: Text Processing"
25
+ ]
26
+
27
+ [project.urls]
28
+ Repository = "https://github.com/okikirmui/handic-py"  # repository of this pip packaging project, not of the upstream HanDic dictionary
handic-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+