nlpir 0.0.4-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/LICENSE.txt +22 -0
- data/README.md +195 -0
- data/Rakefile +11 -0
- data/bin/NLPIR.dll +0 -0
- data/lib/Data/BIG2GBK.map +0 -0
- data/lib/Data/BIG5.pdat +0 -0
- data/lib/Data/BIG5.wordlist +0 -0
- data/lib/Data/BiWord.big +0 -0
- data/lib/Data/Configure.xml +15 -0
- data/lib/Data/CoreDict.pdat +0 -0
- data/lib/Data/CoreDict.pos +0 -0
- data/lib/Data/CoreDict.unig +0 -0
- data/lib/Data/FieldDict.pdat +0 -0
- data/lib/Data/FieldDict.pos +0 -0
- data/lib/Data/GBK.pdat +0 -0
- data/lib/Data/GBK.wordlist +0 -0
- data/lib/Data/GBK2BIG.map +0 -0
- data/lib/Data/GBK2GBKC.map +0 -0
- data/lib/Data/GBK2UTF.map +0 -0
- data/lib/Data/GBKA.pdat +0 -0
- data/lib/Data/GBKA.wordlist +0 -0
- data/lib/Data/GBKA2UTF.map +0 -0
- data/lib/Data/GBKC.pdat +0 -0
- data/lib/Data/GBKC.wordlist +0 -0
- data/lib/Data/GBKC2GBK.map +0 -0
- data/lib/Data/GranDict.pdat +3369 -8
- data/lib/Data/GranDict.pos +0 -0
- data/lib/Data/ICTPOS.map +96 -0
- data/lib/Data/NLPIR.ctx +0 -0
- data/lib/Data/NLPIR.user +0 -0
- data/lib/Data/NLPIR_First.map +96 -0
- data/lib/Data/NewWord.lst +25 -0
- data/lib/Data/PKU.map +96 -0
- data/lib/Data/PKU_First.map +96 -0
- data/lib/Data/UTF2GBK.map +0 -0
- data/lib/Data/UTF2GBKA.map +0 -0
- data/lib/Data/UTF8.pdat +0 -0
- data/lib/Data/UTF8.wordlist +0 -0
- data/lib/Data/UserDict.pdat +0 -0
- data/lib/Data/charset.type +0 -0
- data/lib/Data/nr.ctx +0 -0
- data/lib/Data/nr.fsa +0 -0
- data/lib/Data/nr.role +0 -0
- data/lib/nlpir/version.rb +3 -0
- data/lib/nlpir.rb +275 -0
- data/nlpir.gemspec +25 -0
- data/test/Data/BIG2GBK.map +0 -0
- data/test/Data/BIG5.pdat +0 -0
- data/test/Data/BIG5.wordlist +0 -0
- data/test/Data/BiWord.big +0 -0
- data/test/Data/Configure.xml +15 -0
- data/test/Data/CoreDict.pdat +0 -0
- data/test/Data/CoreDict.pos +0 -0
- data/test/Data/CoreDict.unig +0 -0
- data/test/Data/FieldDict.pdat +0 -0
- data/test/Data/FieldDict.pos +0 -0
- data/test/Data/GBK.pdat +0 -0
- data/test/Data/GBK.wordlist +0 -0
- data/test/Data/GBK2BIG.map +0 -0
- data/test/Data/GBK2GBKC.map +0 -0
- data/test/Data/GBK2UTF.map +0 -0
- data/test/Data/GBKA.pdat +0 -0
- data/test/Data/GBKA.wordlist +0 -0
- data/test/Data/GBKA2UTF.map +0 -0
- data/test/Data/GBKC.pdat +0 -0
- data/test/Data/GBKC.wordlist +0 -0
- data/test/Data/GBKC2GBK.map +0 -0
- data/test/Data/GranDict.pdat +3369 -8
- data/test/Data/GranDict.pos +0 -0
- data/test/Data/ICTPOS.map +96 -0
- data/test/Data/NLPIR.ctx +0 -0
- data/test/Data/NLPIR.user +0 -0
- data/test/Data/NLPIR_First.map +96 -0
- data/test/Data/NewWord.lst +73 -0
- data/test/Data/PKU.map +96 -0
- data/test/Data/PKU_First.map +96 -0
- data/test/Data/UTF2GBK.map +0 -0
- data/test/Data/UTF2GBKA.map +0 -0
- data/test/Data/UTF8.pdat +0 -0
- data/test/Data/UTF8.wordlist +0 -0
- data/test/Data/UserDict.pdat +0 -0
- data/test/Data/charset.type +0 -0
- data/test/Data/nr.ctx +0 -0
- data/test/Data/nr.fsa +0 -0
- data/test/Data/nr.role +0 -0
- data/test/test.txt +52 -0
- data/test/test_nlpir.rb +158 -0
- data/test/test_result.txt +87 -0
- data/test/userdict.txt +5 -0
- metadata +206 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 65987eefe0d616b08e0f6659c43cd8b79469dab1
|
4
|
+
data.tar.gz: 9fe53a61bea4bd9a877665c6f4edc096c5f80365
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b08f5d4d63371d12f7a73bf810ab97f8e2288d52e4a60913c29ba8797c55567d6c5a96b95a243079e0ef57cb8a15a5e7188a954f37c7588d6314bb558ecc1367
|
7
|
+
data.tar.gz: b48b25cee53d3b7158acce346bd608eec8ce97628949d3740f3c133bee7efa86cfe53ba2fe1b0710e1d4c91472593aeef2f0f8aaa930e3c8b7bf8878e9953943
|
data/.gitignore
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 JoeWoo
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
# Nlpir_win
|
2
|
+
|
3
|
+
A RubyGem wrapper for ICTCLAS2013, a Chinese word segmentation tool.
|
4
|
+
|
5
|
+
Nlpir version 0.0.4: the gem nlpir-0.0.4-x86-mingw32 supports Ruby >= 2.0.0 on Windows 7.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'nlpir'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install nlpir
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
|
24
|
+
Some constants you may use:
|
25
|
+
```ruby
|
26
|
+
|
27
|
+
NLPIR_FALSE = 0
|
28
|
+
NLPIR_TRUE = 1
|
29
|
+
POS_MAP_NUMBER = 4
|
30
|
+
ICT_POS_MAP_FIRST = 1 #计算所一级标注集
|
31
|
+
ICT_POS_MAP_SECOND = 0 #计算所二级标注集
|
32
|
+
PKU_POS_MAP_SECOND = 2 #北大二级标注集
|
33
|
+
PKU_POS_MAP_FIRST = 3 #北大一级标注集
|
34
|
+
POS_SIZE = 40
|
35
|
+
|
36
|
+
#词条结构体 term struct
|
37
|
+
Result_t = struct ['int start','int length',"char sPOS[#{POS_SIZE}]",'int iPOS',
|
38
|
+
'int word_ID','int word_type','double weight']
|
39
|
+
|
40
|
+
GBK_CODE = 0 #GBK编码
|
41
|
+
UTF8_CODE = GBK_CODE + 1 #UTF8编码
|
42
|
+
BIG5_CODE = GBK_CODE + 2 #BIG5编码
|
43
|
+
GBK_FANTI_CODE = GBK_CODE + 3 #GBK编码,包含繁体字
|
44
|
+
|
45
|
+
```
|
46
|
+
|
47
|
+
After you install the gem, use it as follows.
|
48
|
+
|
49
|
+
You can also find more examples in the test cases [here](https://github.com/JoeWoo/nlpir_win/blob/master/test/test_nlpir.rb).
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
|
53
|
+
require 'nlpir'
|
54
|
+
include Nlpir
|
55
|
+
|
56
|
+
s = "坚定不移沿着中国特色社会主义道路前进 为全面建成小康社会而奋斗"
|
57
|
+
#first of all : Call the NLPIR API NLPIR_Init
|
58
|
+
|
59
|
+
NLPIR_Init(nil, UTF8_CODE , File.expand_path("../", __FILE__))
|
60
|
+
|
61
|
+
#example1: Process a paragraph, and return the result text with POS or not
|
62
|
+
puts NLPIR_ParagraphProcess(s, NLPIR_TRUE)
|
63
|
+
puts NLPIR_ParagraphProcess(s, NLPIR_FALSE)
|
64
|
+
|
65
|
+
#example2: Process a paragraph, and return an array whose elements are POS-tagged words.
|
66
|
+
#tips: NLPIR_ParagraphProcessA() returns the array; its memory is allocated by NLPIR and will be freed by NLPIR_Exit() (memory in server)
|
67
|
+
|
68
|
+
words_list = NLPIR_ParagraphProcessA(s)
|
69
|
+
i=1
|
70
|
+
words_list.each do |a|
|
71
|
+
sWhichDic=""
|
72
|
+
case a.word_type
|
73
|
+
when 0
|
74
|
+
sWhichDic = "核心词典"
|
75
|
+
when 1
|
76
|
+
sWhichDic = "用户词典"
|
77
|
+
when 2
|
78
|
+
sWhichDic = "专业词典"
|
79
|
+
end
|
80
|
+
puts "No.#{i}:start:#{a.start}, length:#{a.length}, POS_ID:#{a.sPOS},word_ID:#{a.word_ID},word_type:#{a.word_type} , UserDefine:#{sWhichDic}, Word:#{s.byteslice(a.start,a.length)}, Weight:#{a.weight}\n"
|
81
|
+
i += 1
|
82
|
+
end
|
83
|
+
|
84
|
+
#example3: Process a paragraph, and return an array whose elements are POS-tagged words.
|
85
|
+
#tips: NLPIR_ParagraphProcessAW() returns the array; its memory is allocated by Ruby's Fiddle and is collected by the GC (memory in agent)
|
86
|
+
|
87
|
+
words_list = NLPIR_ParagraphProcessAW(s)
|
88
|
+
i=1
|
89
|
+
words_list.each do |a|
|
90
|
+
sWhichDic=""
|
91
|
+
case a.word_type
|
92
|
+
when 0
|
93
|
+
sWhichDic = "核心词典"
|
94
|
+
when 1
|
95
|
+
sWhichDic = "用户词典"
|
96
|
+
when 2
|
97
|
+
sWhichDic = "专业词典"
|
98
|
+
end
|
99
|
+
puts "No.#{i}:start:#{a.start}, length:#{a.length}, POS_ID:#{a.sPOS},word_ID:#{a.word_ID},word_type:#{a.word_type} , UserDefine:#{sWhichDic}, Word:#{s.byteslice(a.start,a.length)}, Weight:#{a.weight}\n"
|
100
|
+
i += 1
|
101
|
+
end
|
102
|
+
|
103
|
+
#example4: Process a text file, and write the result text to a file
|
104
|
+
puts NLPIR_FileProcess("./test.txt", "./test_result.txt", NULL)
|
105
|
+
|
106
|
+
|
107
|
+
#example5: Get ProcessAWordCount, it returns the count of the words
|
108
|
+
puts count = NLPIR_GetParagraphProcessAWordCount(s)
|
109
|
+
|
110
|
+
|
111
|
+
#example6: Add/Delete a word to/from the user dictionary (the path of user dictionary is ./data/userdict.pdat)
|
112
|
+
puts NLPIR_ParagraphProcess("我们都是爱思客")
|
113
|
+
#add a user word
|
114
|
+
NLPIR_AddUserWord("都是爱思客 n")
|
115
|
+
puts NLPIR_ParagraphProcess("我们都是爱思客")
|
116
|
+
#save the user word to disk
|
117
|
+
NLPIR_SaveTheUsrDic()
|
118
|
+
puts NLPIR_ParagraphProcess("我们都是爱思客")
|
119
|
+
#delete a user word
|
120
|
+
NLPIR_DelUsrWord("都是爱思客")
|
121
|
+
#save the change to disk
|
122
|
+
NLPIR_SaveTheUsrDic()
|
123
|
+
|
124
|
+
|
125
|
+
#example7: Import user-defined dictionary from a text file. and puts NLPIR result
|
126
|
+
puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
|
127
|
+
puts NLPIR_ImportUserDict("./userdict.txt")
|
128
|
+
NLPIR_AddUserWord("1989年春夏之交的政治风波 n")
|
129
|
+
#you can see the example file ./test/userdict.txt to learn the userdict's format requirements
|
130
|
+
puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
|
131
|
+
NLPIR_DelUsrWord("1989年春夏之交的政治风波")
|
132
|
+
puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
|
133
|
+
|
134
|
+
|
135
|
+
#example8: Get keywords of text
|
136
|
+
#2nd parameter is the MaxNumber of keywords
|
137
|
+
#3rd parameter is a switch to show the WeightOut or not
|
138
|
+
puts NLPIR_GetKeyWords(s, 50,NLPIR_TRUE)
|
139
|
+
|
140
|
+
|
141
|
+
#example9: Get keywords from file
|
142
|
+
puts NLPIR_GetFileKeyWords("./test.txt",50, NLPIR_TRUE)
|
143
|
+
|
144
|
+
|
145
|
+
#example10: Find new words from text
|
146
|
+
puts NLPIR_GetNewWords(s, 50, NLPIR_TRUE)
|
147
|
+
|
148
|
+
|
149
|
+
#example11: Find new words from file
|
150
|
+
puts NLPIR_GetFileNewWords("./test.txt")
|
151
|
+
|
152
|
+
|
153
|
+
#example12: Extract a finger print from the paragraph
|
154
|
+
puts NLPIR_FingerPrint(s)
|
155
|
+
|
156
|
+
|
157
|
+
#example13: select which pos map will use
|
158
|
+
#ICT_POS_MAP_FIRST #//计算所一级标注集
|
159
|
+
#ICT_POS_MAP_SECOND #//计算所二级标注集
|
160
|
+
#PKU_POS_MAP_SECOND #//北大二级标注集
|
161
|
+
#PKU_POS_MAP_FIRST #//北大一级标注集
|
162
|
+
NLPIR_SetPOSmap(ICT_POS_MAP_FIRST)
|
163
|
+
puts NLPIR_ParagraphProcess(s)
|
164
|
+
NLPIR_SetPOSmap(PKU_POS_MAP_FIRST)
|
165
|
+
puts NLPIR_ParagraphProcess(s)
|
166
|
+
|
167
|
+
|
168
|
+
|
169
|
+
# 新词发现批量处理功能
|
170
|
+
#以下函数为2013版本专门针对新词发现的过程,一般建议脱机实现,不宜在线处理
|
171
|
+
# 新词识别完成后,再自动导入到分词系统中,即可完成
|
172
|
+
|
173
|
+
NLPIR_NWI_Start() #启动新词发现功能
|
174
|
+
f=File.new("test.txt", "r")
|
175
|
+
text=f.read
|
176
|
+
NLPIR_NWI_AddFile(text)#添加新词训练的文件,可反复添加
|
177
|
+
NLPIR_NWI_Complete()#添加文件或者训练内容结束
|
178
|
+
f.close()
|
179
|
+
puts NLPIR_NWI_GetResult()#输出新词识别结果
|
180
|
+
#puts NLPIR_FileProcess("a.txt","b.txt")
|
181
|
+
NLPIR_NWI_Result2UserDict()#新词识别结果导入到用户词典
|
182
|
+
|
183
|
+
|
184
|
+
#at the end, call NLPIR_Exit() to free system resources
|
185
|
+
NLPIR_Exit()
|
186
|
+
|
187
|
+
```
|
188
|
+
|
189
|
+
## Contributing
|
190
|
+
|
191
|
+
1. Fork it
|
192
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
193
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
194
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
195
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/bin/NLPIR.dll
ADDED
Binary file
|
Binary file
|
data/lib/Data/BIG5.pdat
ADDED
Binary file
|
Binary file
|
data/lib/Data/BiWord.big
ADDED
Binary file
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<?xml version="1.0" encoding="GB2312"?>
|
2
|
+
<NLPIR>
|
3
|
+
<TagSet>ICTPOS.map</TagSet>//���Ա�ע��ӳ���ļ�
|
4
|
+
<UserDict>on</UserDict>//On��UserDictionaryapplied;Off:notapplied��
|
5
|
+
<UserDictPrior>On</UserDictPrior>//�û��ʵ�����,Addedin2006-03-16,requiredbyNECOn���û��ʵ�ͺ��Ĵʵ���ͬʱ�еĴʻ㣬�û��ʵ����ȣ������ܲ�Ҫ���ã���������Ĵʵ��еĴʶ�����Ϊ�û��ʵ䣬��Ч���ʵ��䷴
|
6
|
+
<FieldDict>off</FieldDict>//On��FieldDictionaryapplied;Off:notapplied��
|
7
|
+
<GranularityContorl>off</GranularityContorl>
|
8
|
+
<Log>On</Log>//On,Off�����磺Off:�ر���־���ܣ�On:����־����
|
9
|
+
<version>2013</version>//ϵͳ�汾��
|
10
|
+
<Modify>2012-11-14</Modify>//ϵͳ�����ʱ��
|
11
|
+
<Lexicon>2012-11-14</Lexicon>//�ʵ������ʱ��
|
12
|
+
<adaptive>true</adaptive>//����Ӧ�ִʣ�Ĭ��Ϊfalse������Ӧ�ִʵ�Ч�ʻ�ϵ�
|
13
|
+
<author>�Ż�ƽ��ʿ</author>//����
|
14
|
+
<Contact>pipy_zhang@msn.com</Contact>//������ϵ��ʽ
|
15
|
+
</NLPIR>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/Data/GBK.pdat
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/Data/GBKA.pdat
ADDED
Binary file
|
Binary file
|
Binary file
|
data/lib/Data/GBKC.pdat
ADDED
Binary file
|
Binary file
|
Binary file
|