handic 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- handic-0.1.0/LICENSE +41 -0
- handic-0.1.0/LICENSE.handic +28 -0
- handic-0.1.0/MANIFEST.in +1 -0
- handic-0.1.0/PKG-INFO +162 -0
- handic-0.1.0/README.md +143 -0
- handic-0.1.0/handic/__init__.py +1 -0
- handic-0.1.0/handic/dicdir/README.md +7 -0
- handic-0.1.0/handic/dicdir/char.bin +0 -0
- handic-0.1.0/handic/dicdir/dicrc +15 -0
- handic-0.1.0/handic/dicdir/matrix.bin +0 -0
- handic-0.1.0/handic/dicdir/mecabrc +2 -0
- handic-0.1.0/handic/dicdir/sys.dic +0 -0
- handic-0.1.0/handic/dicdir/unk.dic +0 -0
- handic-0.1.0/handic/dicdir/version +1 -0
- handic-0.1.0/handic/handic.py +13 -0
- handic-0.1.0/handic.egg-info/PKG-INFO +162 -0
- handic-0.1.0/handic.egg-info/SOURCES.txt +19 -0
- handic-0.1.0/handic.egg-info/dependency_links.txt +1 -0
- handic-0.1.0/handic.egg-info/top_level.txt +1 -0
- handic-0.1.0/pyproject.toml +28 -0
- handic-0.1.0/setup.cfg +4 -0
handic-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright 2020 Paul McCann
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
7
|
+
the Software without restriction, including without limitation the rights to
|
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
9
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
|
10
|
+
so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
Copyright 2024 Yoshinori Sugai
|
|
24
|
+
|
|
25
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
26
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
27
|
+
the Software without restriction, including without limitation the rights to
|
|
28
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
29
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
|
30
|
+
so, subject to the following conditions:
|
|
31
|
+
|
|
32
|
+
The above copyright notice and this permission notice shall be included in all
|
|
33
|
+
copies or substantial portions of the Software.
|
|
34
|
+
|
|
35
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
36
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
37
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
38
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
39
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
40
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
41
|
+
SOFTWARE.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2011-, Yoshinori Sugai
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
16
|
+
contributors may be used to endorse or promote products derived from
|
|
17
|
+
this software without specific prior written permission.
|
|
18
|
+
|
|
19
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
20
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
21
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
23
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
handic-0.1.0/MANIFEST.in
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include handic/dicdir/*
|
handic-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: handic
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HanDic package for installing via pip.
|
|
5
|
+
Author: Yoshinori Sugai
|
|
6
|
+
Author-email: okikirmui+github@gmail.com
|
|
7
|
+
License: MIT License
|
|
8
|
+
Project-URL: Repository, https://github.com/okikirmui/handic-py
|
|
9
|
+
Keywords: handic,MeCab,Korean Language,morphological analysis,morphological analysis dictionary,korean text processing
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Natural Language :: Korean
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Text Processing
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
License-File: LICENSE.handic
|
|
19
|
+
|
|
20
|
+
# handic-py
|
|
21
|
+
|
|
22
|
+
This package installs [HanDic](https://github.com/okikirmui/handic), a dictionary for morphological analysis of the Korean language, via pip so that it can be used from Python.
|
|
23
|
+
|
|
24
|
+
To use this package for morphological analysis, a MeCab wrapper such as [mecab-python3](https://github.com/SamuraiT/mecab-python3) is required.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
from PyPI:
|
|
29
|
+
|
|
30
|
+
```Shell
|
|
31
|
+
pip install handic
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
Since HanDic requires Hangul Jamo (Unicode Hangul Jamo) as input, please convert Hangul (Unicode Hangul Syllables) using a module such as [jamotools](https://pypi.org/project/jamotools/) or the `tools/k2jamo.py` script included in HanDic.
|
|
37
|
+
|
|
38
|
+
### basic
|
|
39
|
+
|
|
40
|
+
example:
|
|
41
|
+
|
|
42
|
+
```Python
|
|
43
|
+
import MeCab
|
|
44
|
+
import handic
|
|
45
|
+
import jamotools
|
|
46
|
+
|
|
47
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR}'
|
|
48
|
+
|
|
49
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
50
|
+
tokenizer.parse('')
|
|
51
|
+
|
|
52
|
+
# 《표준국어대사전》 "형태소" 뜻풀이
|
|
53
|
+
sentence = u'뜻을 가진 가장 작은 말의 단위. ‘이야기책’의 ‘이야기’, ‘책’ 따위이다.'
|
|
54
|
+
|
|
55
|
+
jamo = jamotools.split_syllables(sentence, jamo_type="JAMO")
|
|
56
|
+
|
|
57
|
+
node = tokenizer.parseToNode(jamo)
|
|
58
|
+
while node:
|
|
59
|
+
print(node.surface, node.feature)
|
|
60
|
+
node = node.next
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
result:
|
|
64
|
+
|
|
65
|
+
```Shell
|
|
66
|
+
BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
67
|
+
뜻 Noun,普通,*,*,*,뜻,뜻,*,*,B,NNG
|
|
68
|
+
을 Ending,助詞,対格,*,*,을02,을,*,*,*,JKO
|
|
69
|
+
가지 Verb,自立,*,語基2,*,가지다,가지,*,*,A,VV
|
|
70
|
+
ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
|
|
71
|
+
가장 Adverb,一般,*,*,*,가장01,가장,*,*,A,MAG
|
|
72
|
+
작으 Adjective,自立,*,語基2,*,작다01,작으,*,*,A,VA
|
|
73
|
+
ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
|
|
74
|
+
말 Noun,普通,動作,*,*,말01,말,*,*,A,NNG
|
|
75
|
+
의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
|
|
76
|
+
단위 Noun,普通,*,*,*,단위02,단위,單位,*,C,NNG
|
|
77
|
+
. Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
|
|
78
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
79
|
+
이야기책 Noun,普通,*,*,*,이야기책,이야기책,이야기冊,*,*,NNG
|
|
80
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
81
|
+
의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
|
|
82
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
83
|
+
이야기 Noun,普通,動作,*,*,이야기,이야기,*,*,A,NNG
|
|
84
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
85
|
+
, Symbol,コンマ,*,*,*,",",",",*,*,*,SP
|
|
86
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
87
|
+
책 Noun,普通,*,*,*,책01,책,冊,*,A,NNG
|
|
88
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
89
|
+
따위 Noun,依存名詞,*,*,*,따위,따위,*,*,*,NNB
|
|
90
|
+
이 Siteisi,非自立,*,語基1,*,이다,이,*,*,*,VCP
|
|
91
|
+
다 Ending,語尾,終止形,*,1接続,다06,다,*,*,*,EF
|
|
92
|
+
. Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
|
|
93
|
+
BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Tokenize
|
|
97
|
+
|
|
98
|
+
example:
|
|
99
|
+
|
|
100
|
+
```Python
|
|
101
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR} -Otokenize'
|
|
102
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
103
|
+
|
|
104
|
+
print(tokenizer.parse(jamo))
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
result:
|
|
108
|
+
|
|
109
|
+
```Shell
|
|
110
|
+
뜻 을 가지 ㄴ 가장 작으 ㄴ 말 의 단위 . ‘ 이야기책 ’ 의 ‘ 이야기 ’ , ‘ 책 ’ 따위 이 다 .
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Extracting specific POS
|
|
114
|
+
|
|
115
|
+
example:
|
|
116
|
+
|
|
117
|
+
```Python
|
|
118
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR}'
|
|
119
|
+
|
|
120
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
121
|
+
tokenizer.parse('')
|
|
122
|
+
|
|
123
|
+
node = tokenizer.parseToNode(jamo)
|
|
124
|
+
while node:
|
|
125
|
+
# 일반명사(pos-tag: NNG)만 추출
|
|
126
|
+
if node.feature.split(',')[10] in ['NNG']:
|
|
127
|
+
print(node.feature.split(',')[5])
|
|
128
|
+
node = node.next
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
result:
|
|
132
|
+
|
|
133
|
+
```Shell
|
|
134
|
+
뜻
|
|
135
|
+
말01
|
|
136
|
+
단위02
|
|
137
|
+
이야기책
|
|
138
|
+
이야기
|
|
139
|
+
책01
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Features
|
|
143
|
+
|
|
144
|
+
Here is the list of features included in HanDic. For more information, see the [HanDic 품사 정보](https://github.com/okikirmui/handic/blob/main/docs/pos_detail.md).
|
|
145
|
+
|
|
146
|
+
- 품사1, 품사2, 품사3: part of speech(index: 0-2)
|
|
147
|
+
- 활용형: conjugation "base"(ex. `語基1`, `語基2`, `語基3`)(index: 3)
|
|
148
|
+
- 접속 정보: which "base" the ending is attached to(ex. `1接続`, `2接続`, etc.)(index: 4)
|
|
149
|
+
- 사전 항목: base forms(index: 5)
|
|
150
|
+
- 표층형: surface(index: 6)
|
|
151
|
+
- 한자: for sino-words(index: 7)
|
|
152
|
+
- 보충 정보: miscellaneous information(index: 8)
|
|
153
|
+
- 학습 수준: learning level(index: 9)
|
|
154
|
+
- 세종계획 품사 태그: pos-tag(index: 10)
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
This code is licensed under the MIT license. HanDic is copyright Yoshinori Sugai and distributed under the [BSD license](./LICENSE.handic).
|
|
159
|
+
|
|
160
|
+
## Acknowledgment
|
|
161
|
+
|
|
162
|
+
This repository is forked from [unidic-lite](https://github.com/polm/unidic-lite) with some modifications and file additions and deletions.
|
handic-0.1.0/README.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# handic-py
|
|
2
|
+
|
|
3
|
+
This package installs [HanDic](https://github.com/okikirmui/handic), a dictionary for morphological analysis of the Korean language, via pip so that it can be used from Python.
|
|
4
|
+
|
|
5
|
+
To use this package for morphological analysis, a MeCab wrapper such as [mecab-python3](https://github.com/SamuraiT/mecab-python3) is required.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
from PyPI:
|
|
10
|
+
|
|
11
|
+
```Shell
|
|
12
|
+
pip install handic
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
Since HanDic requires Hangul Jamo (Unicode Hangul Jamo) as input, please convert Hangul (Unicode Hangul Syllables) using a module such as [jamotools](https://pypi.org/project/jamotools/) or the `tools/k2jamo.py` script included in HanDic.
|
|
18
|
+
|
|
19
|
+
### basic
|
|
20
|
+
|
|
21
|
+
example:
|
|
22
|
+
|
|
23
|
+
```Python
|
|
24
|
+
import MeCab
|
|
25
|
+
import handic
|
|
26
|
+
import jamotools
|
|
27
|
+
|
|
28
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR}'
|
|
29
|
+
|
|
30
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
31
|
+
tokenizer.parse('')
|
|
32
|
+
|
|
33
|
+
# 《표준국어대사전》 "형태소" 뜻풀이
|
|
34
|
+
sentence = u'뜻을 가진 가장 작은 말의 단위. ‘이야기책’의 ‘이야기’, ‘책’ 따위이다.'
|
|
35
|
+
|
|
36
|
+
jamo = jamotools.split_syllables(sentence, jamo_type="JAMO")
|
|
37
|
+
|
|
38
|
+
node = tokenizer.parseToNode(jamo)
|
|
39
|
+
while node:
|
|
40
|
+
print(node.surface, node.feature)
|
|
41
|
+
node = node.next
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
result:
|
|
45
|
+
|
|
46
|
+
```Shell
|
|
47
|
+
BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
48
|
+
뜻 Noun,普通,*,*,*,뜻,뜻,*,*,B,NNG
|
|
49
|
+
을 Ending,助詞,対格,*,*,을02,을,*,*,*,JKO
|
|
50
|
+
가지 Verb,自立,*,語基2,*,가지다,가지,*,*,A,VV
|
|
51
|
+
ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
|
|
52
|
+
가장 Adverb,一般,*,*,*,가장01,가장,*,*,A,MAG
|
|
53
|
+
작으 Adjective,自立,*,語基2,*,작다01,작으,*,*,A,VA
|
|
54
|
+
ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
|
|
55
|
+
말 Noun,普通,動作,*,*,말01,말,*,*,A,NNG
|
|
56
|
+
의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
|
|
57
|
+
단위 Noun,普通,*,*,*,단위02,단위,單位,*,C,NNG
|
|
58
|
+
. Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
|
|
59
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
60
|
+
이야기책 Noun,普通,*,*,*,이야기책,이야기책,이야기冊,*,*,NNG
|
|
61
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
62
|
+
의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
|
|
63
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
64
|
+
이야기 Noun,普通,動作,*,*,이야기,이야기,*,*,A,NNG
|
|
65
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
66
|
+
, Symbol,コンマ,*,*,*,",",",",*,*,*,SP
|
|
67
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
68
|
+
책 Noun,普通,*,*,*,책01,책,冊,*,A,NNG
|
|
69
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
70
|
+
따위 Noun,依存名詞,*,*,*,따위,따위,*,*,*,NNB
|
|
71
|
+
이 Siteisi,非自立,*,語基1,*,이다,이,*,*,*,VCP
|
|
72
|
+
다 Ending,語尾,終止形,*,1接続,다06,다,*,*,*,EF
|
|
73
|
+
. Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
|
|
74
|
+
BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Tokenize
|
|
78
|
+
|
|
79
|
+
example:
|
|
80
|
+
|
|
81
|
+
```Python
|
|
82
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR} -Otokenize'
|
|
83
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
84
|
+
|
|
85
|
+
print(tokenizer.parse(jamo))
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
result:
|
|
89
|
+
|
|
90
|
+
```Shell
|
|
91
|
+
뜻 을 가지 ㄴ 가장 작으 ㄴ 말 의 단위 . ‘ 이야기책 ’ 의 ‘ 이야기 ’ , ‘ 책 ’ 따위 이 다 .
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Extracting specific POS
|
|
95
|
+
|
|
96
|
+
example:
|
|
97
|
+
|
|
98
|
+
```Python
|
|
99
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR}'
|
|
100
|
+
|
|
101
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
102
|
+
tokenizer.parse('')
|
|
103
|
+
|
|
104
|
+
node = tokenizer.parseToNode(jamo)
|
|
105
|
+
while node:
|
|
106
|
+
# 일반명사(pos-tag: NNG)만 추출
|
|
107
|
+
if node.feature.split(',')[10] in ['NNG']:
|
|
108
|
+
print(node.feature.split(',')[5])
|
|
109
|
+
node = node.next
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
result:
|
|
113
|
+
|
|
114
|
+
```Shell
|
|
115
|
+
뜻
|
|
116
|
+
말01
|
|
117
|
+
단위02
|
|
118
|
+
이야기책
|
|
119
|
+
이야기
|
|
120
|
+
책01
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Features
|
|
124
|
+
|
|
125
|
+
Here is the list of features included in HanDic. For more information, see the [HanDic 품사 정보](https://github.com/okikirmui/handic/blob/main/docs/pos_detail.md).
|
|
126
|
+
|
|
127
|
+
- 품사1, 품사2, 품사3: part of speech(index: 0-2)
|
|
128
|
+
- 활용형: conjugation "base"(ex. `語基1`, `語基2`, `語基3`)(index: 3)
|
|
129
|
+
- 접속 정보: which "base" the ending is attached to(ex. `1接続`, `2接続`, etc.)(index: 4)
|
|
130
|
+
- 사전 항목: base forms(index: 5)
|
|
131
|
+
- 표층형: surface(index: 6)
|
|
132
|
+
- 한자: for sino-words(index: 7)
|
|
133
|
+
- 보충 정보: miscellaneous information(index: 8)
|
|
134
|
+
- 학습 수준: learning level(index: 9)
|
|
135
|
+
- 세종계획 품사 태그: pos-tag(index: 10)
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
This code is licensed under the MIT license. HanDic is copyright Yoshinori Sugai and distributed under the [BSD license](./LICENSE.handic).
|
|
140
|
+
|
|
141
|
+
## Acknowledgment
|
|
142
|
+
|
|
143
|
+
This repository is forked from [unidic-lite](https://github.com/polm/unidic-lite) with some modifications and file additions and deletions.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .handic import DICDIR, VERSION
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# HanDic
|
|
2
|
+
|
|
3
|
+
HanDic is a dictionary for morphological analysis of the Korean language with the morphological analysis engine MeCab. It consists of over 120,000 entries and was trained and built with data centered on written language, such as newspapers, news, novels, and textbooks.
|
|
4
|
+
|
|
5
|
+
For more information, please refer to [HanDic](https://github.com/okikirmui/handic).
|
|
6
|
+
|
|
7
|
+
HanDic is copyright Yoshinori Sugai and distributed under the BSD-3-Clause license.
|
|
Binary file
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
cost-factor = 800
|
|
2
|
+
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
3
|
+
eval-size = 5
|
|
4
|
+
unk-eval-size = 3
|
|
5
|
+
config-charset = utf8
|
|
6
|
+
|
|
7
|
+
; ChaSen for KH Coder
|
|
8
|
+
node-format-chasen = %f[6]\t%M\t%f[5]\t%F-[0,1,2]\t%f[3]\t%f[4]\n
|
|
9
|
+
unk-format-chasen = %f[6]\t%M\t%f[6]\t%F-[0,1,2]\t\t\n
|
|
10
|
+
eos-format-chasen = EOS\n
|
|
11
|
+
|
|
12
|
+
; Tokenize option for Korean
|
|
13
|
+
node-format-tokenize = %f[6]\s
|
|
14
|
+
unk-format-tokenize = %m\s
|
|
15
|
+
eos-format-tokenize = \n
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
v20241027
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
def get_version(dicdir):
|
|
5
|
+
vpath = os.path.join(dicdir, 'version')
|
|
6
|
+
with open(vpath) as vfile:
|
|
7
|
+
return vfile.read().strip()
|
|
8
|
+
|
|
9
|
+
_curdir = os.path.dirname(__file__)
|
|
10
|
+
|
|
11
|
+
# This will be used elsewhere to initialize the tagger
|
|
12
|
+
DICDIR = os.path.join(_curdir, 'dicdir')
|
|
13
|
+
VERSION = get_version(DICDIR)
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: handic
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HanDic package for installing via pip.
|
|
5
|
+
Author: Yoshinori Sugai
|
|
6
|
+
Author-email: okikirmui+github@gmail.com
|
|
7
|
+
License: MIT License
|
|
8
|
+
Project-URL: Repository, https://github.com/okikirmui/handic-py
|
|
9
|
+
Keywords: handic,MeCab,Korean Language,morphological analysis,morphological analysis dictionary,korean text processing
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Natural Language :: Korean
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Text Processing
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
License-File: LICENSE.handic
|
|
19
|
+
|
|
20
|
+
# handic-py
|
|
21
|
+
|
|
22
|
+
This package installs [HanDic](https://github.com/okikirmui/handic), a dictionary for morphological analysis of the Korean language, via pip so that it can be used from Python.
|
|
23
|
+
|
|
24
|
+
To use this package for morphological analysis, a MeCab wrapper such as [mecab-python3](https://github.com/SamuraiT/mecab-python3) is required.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
from PyPI:
|
|
29
|
+
|
|
30
|
+
```Shell
|
|
31
|
+
pip install handic
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
Since HanDic requires Hangul Jamo (Unicode Hangul Jamo) as input, please convert Hangul (Unicode Hangul Syllables) using a module such as [jamotools](https://pypi.org/project/jamotools/) or the `tools/k2jamo.py` script included in HanDic.
|
|
37
|
+
|
|
38
|
+
### basic
|
|
39
|
+
|
|
40
|
+
example:
|
|
41
|
+
|
|
42
|
+
```Python
|
|
43
|
+
import MeCab
|
|
44
|
+
import handic
|
|
45
|
+
import jamotools
|
|
46
|
+
|
|
47
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR}'
|
|
48
|
+
|
|
49
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
50
|
+
tokenizer.parse('')
|
|
51
|
+
|
|
52
|
+
# 《표준국어대사전》 "형태소" 뜻풀이
|
|
53
|
+
sentence = u'뜻을 가진 가장 작은 말의 단위. ‘이야기책’의 ‘이야기’, ‘책’ 따위이다.'
|
|
54
|
+
|
|
55
|
+
jamo = jamotools.split_syllables(sentence, jamo_type="JAMO")
|
|
56
|
+
|
|
57
|
+
node = tokenizer.parseToNode(jamo)
|
|
58
|
+
while node:
|
|
59
|
+
print(node.surface, node.feature)
|
|
60
|
+
node = node.next
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
result:
|
|
64
|
+
|
|
65
|
+
```Shell
|
|
66
|
+
BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
67
|
+
뜻 Noun,普通,*,*,*,뜻,뜻,*,*,B,NNG
|
|
68
|
+
을 Ending,助詞,対格,*,*,을02,을,*,*,*,JKO
|
|
69
|
+
가지 Verb,自立,*,語基2,*,가지다,가지,*,*,A,VV
|
|
70
|
+
ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
|
|
71
|
+
가장 Adverb,一般,*,*,*,가장01,가장,*,*,A,MAG
|
|
72
|
+
작으 Adjective,自立,*,語基2,*,작다01,작으,*,*,A,VA
|
|
73
|
+
ᆫ Ending,語尾,連体形,*,2接続,ㄴ05,ㄴ,*,*,*,ETM
|
|
74
|
+
말 Noun,普通,動作,*,*,말01,말,*,*,A,NNG
|
|
75
|
+
의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
|
|
76
|
+
단위 Noun,普通,*,*,*,단위02,단위,單位,*,C,NNG
|
|
77
|
+
. Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
|
|
78
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
79
|
+
이야기책 Noun,普通,*,*,*,이야기책,이야기책,이야기冊,*,*,NNG
|
|
80
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
81
|
+
의 Ending,助詞,属格,*,*,의10,의,*,*,*,JKG
|
|
82
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
83
|
+
이야기 Noun,普通,動作,*,*,이야기,이야기,*,*,A,NNG
|
|
84
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
85
|
+
, Symbol,コンマ,*,*,*,",",",",*,*,*,SP
|
|
86
|
+
‘ Symbol,カッコ,引用符-始,*,*,‘,‘,*,*,*,SS
|
|
87
|
+
책 Noun,普通,*,*,*,책01,책,冊,*,A,NNG
|
|
88
|
+
’ Symbol,カッコ,引用符-終,*,*,’,’,*,*,*,SS
|
|
89
|
+
따위 Noun,依存名詞,*,*,*,따위,따위,*,*,*,NNB
|
|
90
|
+
이 Siteisi,非自立,*,語基1,*,이다,이,*,*,*,VCP
|
|
91
|
+
다 Ending,語尾,終止形,*,1接続,다06,다,*,*,*,EF
|
|
92
|
+
. Symbol,ピリオド,*,*,*,.,.,*,*,*,SF
|
|
93
|
+
BOS/EOS,*,*,*,*,*,*,*,*,*,*
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Tokenize
|
|
97
|
+
|
|
98
|
+
example:
|
|
99
|
+
|
|
100
|
+
```Python
|
|
101
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR} -Otokenize'
|
|
102
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
103
|
+
|
|
104
|
+
print(tokenizer.parse(jamo))
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
result:
|
|
108
|
+
|
|
109
|
+
```Shell
|
|
110
|
+
뜻 을 가지 ㄴ 가장 작으 ㄴ 말 의 단위 . ‘ 이야기책 ’ 의 ‘ 이야기 ’ , ‘ 책 ’ 따위 이 다 .
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Extracting specific POS
|
|
114
|
+
|
|
115
|
+
example:
|
|
116
|
+
|
|
117
|
+
```Python
|
|
118
|
+
mecaboption = f'-r /dev/null -d {handic.DICDIR}'
|
|
119
|
+
|
|
120
|
+
tokenizer = MeCab.Tagger(mecaboption)
|
|
121
|
+
tokenizer.parse('')
|
|
122
|
+
|
|
123
|
+
node = tokenizer.parseToNode(jamo)
|
|
124
|
+
while node:
|
|
125
|
+
# 일반명사(pos-tag: NNG)만 추출
|
|
126
|
+
if node.feature.split(',')[10] in ['NNG']:
|
|
127
|
+
print(node.feature.split(',')[5])
|
|
128
|
+
node = node.next
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
result:
|
|
132
|
+
|
|
133
|
+
```Shell
|
|
134
|
+
뜻
|
|
135
|
+
말01
|
|
136
|
+
단위02
|
|
137
|
+
이야기책
|
|
138
|
+
이야기
|
|
139
|
+
책01
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Features
|
|
143
|
+
|
|
144
|
+
Here is the list of features included in HanDic. For more information, see the [HanDic 품사 정보](https://github.com/okikirmui/handic/blob/main/docs/pos_detail.md).
|
|
145
|
+
|
|
146
|
+
- 품사1, 품사2, 품사3: part of speech(index: 0-2)
|
|
147
|
+
- 활용형: conjugation "base"(ex. `語基1`, `語基2`, `語基3`)(index: 3)
|
|
148
|
+
- 접속 정보: which "base" the ending is attached to(ex. `1接続`, `2接続`, etc.)(index: 4)
|
|
149
|
+
- 사전 항목: base forms(index: 5)
|
|
150
|
+
- 표층형: surface(index: 6)
|
|
151
|
+
- 한자: for sino-words(index: 7)
|
|
152
|
+
- 보충 정보: miscellaneous information(index: 8)
|
|
153
|
+
- 학습 수준: learning level(index: 9)
|
|
154
|
+
- 세종계획 품사 태그: pos-tag(index: 10)
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
This code is licensed under the MIT license. HanDic is copyright Yoshinori Sugai and distributed under the [BSD license](./LICENSE.handic).
|
|
159
|
+
|
|
160
|
+
## Acknowledgment
|
|
161
|
+
|
|
162
|
+
This repository is forked from [unidic-lite](https://github.com/polm/unidic-lite) with some modifications and file additions and deletions.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
LICENSE.handic
|
|
3
|
+
MANIFEST.in
|
|
4
|
+
README.md
|
|
5
|
+
pyproject.toml
|
|
6
|
+
handic/__init__.py
|
|
7
|
+
handic/handic.py
|
|
8
|
+
handic.egg-info/PKG-INFO
|
|
9
|
+
handic.egg-info/SOURCES.txt
|
|
10
|
+
handic.egg-info/dependency_links.txt
|
|
11
|
+
handic.egg-info/top_level.txt
|
|
12
|
+
handic/dicdir/README.md
|
|
13
|
+
handic/dicdir/char.bin
|
|
14
|
+
handic/dicdir/dicrc
|
|
15
|
+
handic/dicdir/matrix.bin
|
|
16
|
+
handic/dicdir/mecabrc
|
|
17
|
+
handic/dicdir/sys.dic
|
|
18
|
+
handic/dicdir/unk.dic
|
|
19
|
+
handic/dicdir/version
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
handic
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools >= 61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "handic"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
requires-python = ">= 3.8"
|
|
9
|
+
authors = [
|
|
10
|
+
{name="Yoshinori Sugai"},
|
|
11
|
+
{email="okikirmui+github@gmail.com"}
|
|
12
|
+
]
|
|
13
|
+
description = "HanDic package for installing via pip."
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
license = {text = "MIT License"}
|
|
16
|
+
keywords = [
|
|
17
|
+
"handic", "MeCab", "Korean Language", "morphological analysis", "morphological analysis dictionary", "korean text processing"
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python",
|
|
22
|
+
"Natural Language :: Korean",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
"Topic :: Text Processing"
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Repository = "https://github.com/okikirmui/handic-py"
|
handic-0.1.0/setup.cfg
ADDED