nlpir 0.0.4-x86-mingw32
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/LICENSE.txt +22 -0
- data/README.md +195 -0
- data/Rakefile +11 -0
- data/bin/NLPIR.dll +0 -0
- data/lib/Data/BIG2GBK.map +0 -0
- data/lib/Data/BIG5.pdat +0 -0
- data/lib/Data/BIG5.wordlist +0 -0
- data/lib/Data/BiWord.big +0 -0
- data/lib/Data/Configure.xml +15 -0
- data/lib/Data/CoreDict.pdat +0 -0
- data/lib/Data/CoreDict.pos +0 -0
- data/lib/Data/CoreDict.unig +0 -0
- data/lib/Data/FieldDict.pdat +0 -0
- data/lib/Data/FieldDict.pos +0 -0
- data/lib/Data/GBK.pdat +0 -0
- data/lib/Data/GBK.wordlist +0 -0
- data/lib/Data/GBK2BIG.map +0 -0
- data/lib/Data/GBK2GBKC.map +0 -0
- data/lib/Data/GBK2UTF.map +0 -0
- data/lib/Data/GBKA.pdat +0 -0
- data/lib/Data/GBKA.wordlist +0 -0
- data/lib/Data/GBKA2UTF.map +0 -0
- data/lib/Data/GBKC.pdat +0 -0
- data/lib/Data/GBKC.wordlist +0 -0
- data/lib/Data/GBKC2GBK.map +0 -0
- data/lib/Data/GranDict.pdat +3369 -8
- data/lib/Data/GranDict.pos +0 -0
- data/lib/Data/ICTPOS.map +96 -0
- data/lib/Data/NLPIR.ctx +0 -0
- data/lib/Data/NLPIR.user +0 -0
- data/lib/Data/NLPIR_First.map +96 -0
- data/lib/Data/NewWord.lst +25 -0
- data/lib/Data/PKU.map +96 -0
- data/lib/Data/PKU_First.map +96 -0
- data/lib/Data/UTF2GBK.map +0 -0
- data/lib/Data/UTF2GBKA.map +0 -0
- data/lib/Data/UTF8.pdat +0 -0
- data/lib/Data/UTF8.wordlist +0 -0
- data/lib/Data/UserDict.pdat +0 -0
- data/lib/Data/charset.type +0 -0
- data/lib/Data/nr.ctx +0 -0
- data/lib/Data/nr.fsa +0 -0
- data/lib/Data/nr.role +0 -0
- data/lib/nlpir/version.rb +3 -0
- data/lib/nlpir.rb +275 -0
- data/nlpir.gemspec +25 -0
- data/test/Data/BIG2GBK.map +0 -0
- data/test/Data/BIG5.pdat +0 -0
- data/test/Data/BIG5.wordlist +0 -0
- data/test/Data/BiWord.big +0 -0
- data/test/Data/Configure.xml +15 -0
- data/test/Data/CoreDict.pdat +0 -0
- data/test/Data/CoreDict.pos +0 -0
- data/test/Data/CoreDict.unig +0 -0
- data/test/Data/FieldDict.pdat +0 -0
- data/test/Data/FieldDict.pos +0 -0
- data/test/Data/GBK.pdat +0 -0
- data/test/Data/GBK.wordlist +0 -0
- data/test/Data/GBK2BIG.map +0 -0
- data/test/Data/GBK2GBKC.map +0 -0
- data/test/Data/GBK2UTF.map +0 -0
- data/test/Data/GBKA.pdat +0 -0
- data/test/Data/GBKA.wordlist +0 -0
- data/test/Data/GBKA2UTF.map +0 -0
- data/test/Data/GBKC.pdat +0 -0
- data/test/Data/GBKC.wordlist +0 -0
- data/test/Data/GBKC2GBK.map +0 -0
- data/test/Data/GranDict.pdat +3369 -8
- data/test/Data/GranDict.pos +0 -0
- data/test/Data/ICTPOS.map +96 -0
- data/test/Data/NLPIR.ctx +0 -0
- data/test/Data/NLPIR.user +0 -0
- data/test/Data/NLPIR_First.map +96 -0
- data/test/Data/NewWord.lst +73 -0
- data/test/Data/PKU.map +96 -0
- data/test/Data/PKU_First.map +96 -0
- data/test/Data/UTF2GBK.map +0 -0
- data/test/Data/UTF2GBKA.map +0 -0
- data/test/Data/UTF8.pdat +0 -0
- data/test/Data/UTF8.wordlist +0 -0
- data/test/Data/UserDict.pdat +0 -0
- data/test/Data/charset.type +0 -0
- data/test/Data/nr.ctx +0 -0
- data/test/Data/nr.fsa +0 -0
- data/test/Data/nr.role +0 -0
- data/test/test.txt +52 -0
- data/test/test_nlpir.rb +158 -0
- data/test/test_result.txt +87 -0
- data/test/userdict.txt +5 -0
- metadata +206 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 65987eefe0d616b08e0f6659c43cd8b79469dab1
|
4
|
+
data.tar.gz: 9fe53a61bea4bd9a877665c6f4edc096c5f80365
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b08f5d4d63371d12f7a73bf810ab97f8e2288d52e4a60913c29ba8797c55567d6c5a96b95a243079e0ef57cb8a15a5e7188a954f37c7588d6314bb558ecc1367
|
7
|
+
data.tar.gz: b48b25cee53d3b7158acce346bd608eec8ce97628949d3740f3c133bee7efa86cfe53ba2fe1b0710e1d4c91472593aeef2f0f8aaa930e3c8b7bf8878e9953943
|
data/.gitignore
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 JoeWoo
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
# Nlpir_win
|
2
|
+
|
3
|
+
A RubyGem wrapper for the Chinese word segmentation tool ICTCLAS2013
|
4
|
+
|
5
|
+
Nlpir version 0.0.4: the gem nlpir-0.0.4-x86-mingw32 supports Ruby >= 2.0.0 on Windows 7.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'nlpir'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install nlpir
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
|
24
|
+
Some constants you may use:
|
25
|
+
```ruby
|
26
|
+
|
27
|
+
NLPIR_FALSE = 0
|
28
|
+
NLPIR_TRUE = 1
|
29
|
+
POS_MAP_NUMBER = 4
|
30
|
+
ICT_POS_MAP_FIRST = 1 #计算所一级标注集
|
31
|
+
ICT_POS_MAP_SECOND = 0 #计算所二级标注集
|
32
|
+
PKU_POS_MAP_SECOND = 2 #北大二级标注集
|
33
|
+
PKU_POS_MAP_FIRST = 3 #北大一级标注集
|
34
|
+
POS_SIZE = 40
|
35
|
+
|
36
|
+
#词条结构体 term struct
|
37
|
+
Result_t = struct ['int start','int length',"char sPOS[#{POS_SIZE}]",'int iPOS',
|
38
|
+
'int word_ID','int word_type','double weight']
|
39
|
+
|
40
|
+
GBK_CODE = 0 #GBK编码
|
41
|
+
UTF8_CODE = GBK_CODE + 1 #UTF8编码
|
42
|
+
BIG5_CODE = GBK_CODE + 2 #BIG5编码
|
43
|
+
GBK_FANTI_CODE = GBK_CODE + 3 #GBK编码,包含繁体字
|
44
|
+
|
45
|
+
```
|
46
|
+
|
47
|
+
after you gem install it:
|
48
|
+
|
49
|
+
You can also see some examples in the test cases [here](https://github.com/JoeWoo/nlpir_win/blob/master/test/test_nlpir.rb)
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
|
53
|
+
require 'nlpir'
|
54
|
+
include Nlpir
|
55
|
+
|
56
|
+
s = "坚定不移沿着中国特色社会主义道路前进 为全面建成小康社会而奋斗"
|
57
|
+
#first of all : Call the NLPIR API NLPIR_Init
|
58
|
+
|
59
|
+
NLPIR_Init(nil, UTF8_CODE , File.expand_path("../", __FILE__))
|
60
|
+
|
61
|
+
#example1: Process a paragraph, and return the result text with POS or not
|
62
|
+
puts NLPIR_ParagraphProcess(s, NLPIR_TRUE)
|
63
|
+
puts NLPIR_ParagraphProcess(s, NLPIR_FALSE)
|
64
|
+
|
65
|
+
#example2: Process a paragraph, and return an array whose elements are POS-tagged words.
|
66
|
+
#tips: NLPIR_ParagraphProcessA() returns the array; its memory is allocated by NLPIR and will be freed by NLPIR_Exit() (memory in server)
|
67
|
+
|
68
|
+
words_list = NLPIR_ParagraphProcessA(s)
|
69
|
+
i=1
|
70
|
+
words_list.each do |a|
|
71
|
+
sWhichDic=""
|
72
|
+
case a.word_type
|
73
|
+
when 0
|
74
|
+
sWhichDic = "核心词典"
|
75
|
+
when 1
|
76
|
+
sWhichDic = "用户词典"
|
77
|
+
when 2
|
78
|
+
sWhichDic = "专业词典"
|
79
|
+
end
|
80
|
+
puts "No.#{i}:start:#{a.start}, length:#{a.length}, POS_ID:#{a.sPOS},word_ID:#{a.word_ID},word_type:#{a.word_type} , UserDefine:#{sWhichDic}, Word:#{s.byteslice(a.start,a.length)}, Weight:#{a.weight}\n"
|
81
|
+
i += 1
|
82
|
+
end
|
83
|
+
|
84
|
+
#example3: Process a paragraph, and return an array whose elements are POS-tagged words.
|
85
|
+
#tips: NLPIR_ParagraphProcessAW() returns the array; its memory is allocated by ruby::fiddle and collected by the GC (memory in agent)
|
86
|
+
|
87
|
+
words_list = NLPIR_ParagraphProcessAW(s)
|
88
|
+
i=1
|
89
|
+
words_list.each do |a|
|
90
|
+
sWhichDic=""
|
91
|
+
case a.word_type
|
92
|
+
when 0
|
93
|
+
sWhichDic = "核心词典"
|
94
|
+
when 1
|
95
|
+
sWhichDic = "用户词典"
|
96
|
+
when 2
|
97
|
+
sWhichDic = "专业词典"
|
98
|
+
end
|
99
|
+
puts "No.#{i}:start:#{a.start}, length:#{a.length}, POS_ID:#{a.sPOS},word_ID:#{a.word_ID},word_type:#{a.word_type} , UserDefine:#{sWhichDic}, Word:#{s.byteslice(a.start,a.length)}, Weight:#{a.weight}\n"
|
100
|
+
i += 1
|
101
|
+
end
|
102
|
+
|
103
|
+
#example4: Process a text file, and write the result text to a file
|
104
|
+
puts NLPIR_FileProcess("./test.txt", "./test_result.txt", NULL)
|
105
|
+
|
106
|
+
|
107
|
+
#example5: Get ProcessAWordCount, it returns the count of the words
|
108
|
+
puts count = NLPIR_GetParagraphProcessAWordCount(s)
|
109
|
+
|
110
|
+
|
111
|
+
#example6: Add/Delete a word to/from the user dictionary (the path of the user dictionary is ./Data/UserDict.pdat)
|
112
|
+
puts NLPIR_ParagraphProcess("我们都是爱思客")
|
113
|
+
#add a user word
|
114
|
+
NLPIR_AddUserWord("都是爱思客 n")
|
115
|
+
puts NLPIR_ParagraphProcess("我们都是爱思客")
|
116
|
+
#save the user word to disk
|
117
|
+
NLPIR_SaveTheUsrDic()
|
118
|
+
puts NLPIR_ParagraphProcess("我们都是爱思客")
|
119
|
+
#delete a user word
|
120
|
+
NLPIR_DelUsrWord("都是爱思客")
|
121
|
+
#save the change to disk
|
122
|
+
NLPIR_SaveTheUsrDic()
|
123
|
+
|
124
|
+
|
125
|
+
#example7: Import a user-defined dictionary from a text file, and print the NLPIR result
|
126
|
+
puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
|
127
|
+
puts NLPIR_ImportUserDict("./userdict.txt")
|
128
|
+
NLPIR_AddUserWord("1989年春夏之交的政治风波 n")
|
129
|
+
#see the example file ./test/userdict.txt for the user dictionary's format requirements
|
130
|
+
puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
|
131
|
+
NLPIR_DelUsrWord("1989年春夏之交的政治风波")
|
132
|
+
puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
|
133
|
+
|
134
|
+
|
135
|
+
#example8: Get keywords of text
|
136
|
+
#2nd parameter is the MaxNumber of keywords
|
137
|
+
#3rd parameter is a switch to show the weight in the output or not
|
138
|
+
puts NLPIR_GetKeyWords(s, 50,NLPIR_TRUE)
|
139
|
+
|
140
|
+
|
141
|
+
#example9: Get keywords from file
|
142
|
+
puts NLPIR_GetFileKeyWords("./test.txt",50, NLPIR_TRUE)
|
143
|
+
|
144
|
+
|
145
|
+
#example10: Find new words from text
|
146
|
+
puts NLPIR_GetNewWords(s, 50, NLPIR_TRUE)
|
147
|
+
|
148
|
+
|
149
|
+
#example11: Find new words from file
|
150
|
+
puts NLPIR_GetFileNewWords("./test.txt")
|
151
|
+
|
152
|
+
|
153
|
+
#example12: Extract a finger print from the paragraph
|
154
|
+
puts NLPIR_FingerPrint(s)
|
155
|
+
|
156
|
+
|
157
|
+
#example13: select which POS map to use
|
158
|
+
#ICT_POS_MAP_FIRST #//计算所一级标注集
|
159
|
+
#ICT_POS_MAP_SECOND #//计算所二级标注集
|
160
|
+
#PKU_POS_MAP_SECOND #//北大二级标注集
|
161
|
+
#PKU_POS_MAP_FIRST #//北大一级标注集
|
162
|
+
NLPIR_SetPOSmap(ICT_POS_MAP_FIRST)
|
163
|
+
puts NLPIR_ParagraphProcess(s)
|
164
|
+
NLPIR_SetPOSmap(PKU_POS_MAP_FIRST)
|
165
|
+
puts NLPIR_ParagraphProcess(s)
|
166
|
+
|
167
|
+
|
168
|
+
|
169
|
+
# 新词发现批量处理功能
|
170
|
+
#以下函数为2013版本专门针对新词发现的过程,一般建议脱机实现,不宜在线处理
|
171
|
+
# 新词识别完成后,再自动导入到分词系统中,即可完成
|
172
|
+
|
173
|
+
NLPIR_NWI_Start() #启动新词发现功能
|
174
|
+
f=File.new("test.txt", "r")
|
175
|
+
text=f.read
|
176
|
+
NLPIR_NWI_AddFile(text)#添加新词训练的文件,可反复添加
|
177
|
+
NLPIR_NWI_Complete()#添加文件或者训练内容结束
|
178
|
+
f.close()
|
179
|
+
puts NLPIR_NWI_GetResult()#输出新词识别结果
|
180
|
+
#puts NLPIR_FileProcess("a.txt","b.txt")
|
181
|
+
NLPIR_NWI_Result2UserDict()#新词识别结果导入到用户词典
|
182
|
+
|
183
|
+
|
184
|
+
#at the end, call NLPIR_Exit() to free system resources
|
185
|
+
NLPIR_Exit()
|
186
|
+
|
187
|
+
```
|
188
|
+
|
189
|
+
## Contributing
|
190
|
+
|
191
|
+
1. Fork it
|
192
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
193
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
194
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
195
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/bin/NLPIR.dll
ADDED
Binary file
|
Binary file
|
data/lib/Data/BIG5.pdat
ADDED
Binary file
|
Binary file
|
data/lib/Data/BiWord.big
ADDED
Binary file
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<?xml version="1.0" encoding="GB2312"?>
|
2
|
+
<NLPIR>
|
3
|
+
<TagSet>ICTPOS.map</TagSet>//���Ա�ע��ӳ���ļ�
|
4
|
+
<UserDict>on</UserDict>//On: UserDictionary applied; Off: not applied.
|
5
|
+
<UserDictPrior>On</UserDictPrior>//�û��ʵ�����,Addedin2006-03-16,requiredbyNECOn���û��ʵ�ͺ��Ĵʵ���ͬʱ�еĴʻ㣬�û��ʵ����ȣ������ܲ�Ҫ���ã���������Ĵʵ��еĴʶ�����Ϊ�û��ʵ䣬��Ч���ʵ��䷴
|
6
|
+
<FieldDict>off</FieldDict>//On: FieldDictionary applied; Off: not applied.
|
7
|
+
<GranularityContorl>off</GranularityContorl>
|
8
|
+
<Log>On</Log>//On,Off�����磺Off:�ر���־���ܣ�On:����־����
|
9
|
+
<version>2013</version>//ϵͳ�汾��
|
10
|
+
<Modify>2012-11-14</Modify>//ϵͳ�����ʱ��
|
11
|
+
<Lexicon>2012-11-14</Lexicon>//�ʵ������ʱ��
|
12
|
+
<adaptive>true</adaptive>//����Ӧ�ִʣ�Ĭ��Ϊfalse������Ӧ�ִʵ�Ч�ʻ�ϵ�
|
13
|
+
<author>�Ż�ƽ��ʿ</author>//����
|
14
|
+
<Contact>pipy_zhang@msn.com</Contact>//������ϵ��ʽ
|
15
|
+
</NLPIR>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/Data/GBK.pdat
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/Data/GBKA.pdat
ADDED
Binary file
|
Binary file
|
Binary file
|
data/lib/Data/GBKC.pdat
ADDED
Binary file
|
Binary file
|
Binary file
|