nlpir 0.0.4-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +195 -0
  5. data/Rakefile +11 -0
  6. data/bin/NLPIR.dll +0 -0
  7. data/lib/Data/BIG2GBK.map +0 -0
  8. data/lib/Data/BIG5.pdat +0 -0
  9. data/lib/Data/BIG5.wordlist +0 -0
  10. data/lib/Data/BiWord.big +0 -0
  11. data/lib/Data/Configure.xml +15 -0
  12. data/lib/Data/CoreDict.pdat +0 -0
  13. data/lib/Data/CoreDict.pos +0 -0
  14. data/lib/Data/CoreDict.unig +0 -0
  15. data/lib/Data/FieldDict.pdat +0 -0
  16. data/lib/Data/FieldDict.pos +0 -0
  17. data/lib/Data/GBK.pdat +0 -0
  18. data/lib/Data/GBK.wordlist +0 -0
  19. data/lib/Data/GBK2BIG.map +0 -0
  20. data/lib/Data/GBK2GBKC.map +0 -0
  21. data/lib/Data/GBK2UTF.map +0 -0
  22. data/lib/Data/GBKA.pdat +0 -0
  23. data/lib/Data/GBKA.wordlist +0 -0
  24. data/lib/Data/GBKA2UTF.map +0 -0
  25. data/lib/Data/GBKC.pdat +0 -0
  26. data/lib/Data/GBKC.wordlist +0 -0
  27. data/lib/Data/GBKC2GBK.map +0 -0
  28. data/lib/Data/GranDict.pdat +3369 -8
  29. data/lib/Data/GranDict.pos +0 -0
  30. data/lib/Data/ICTPOS.map +96 -0
  31. data/lib/Data/NLPIR.ctx +0 -0
  32. data/lib/Data/NLPIR.user +0 -0
  33. data/lib/Data/NLPIR_First.map +96 -0
  34. data/lib/Data/NewWord.lst +25 -0
  35. data/lib/Data/PKU.map +96 -0
  36. data/lib/Data/PKU_First.map +96 -0
  37. data/lib/Data/UTF2GBK.map +0 -0
  38. data/lib/Data/UTF2GBKA.map +0 -0
  39. data/lib/Data/UTF8.pdat +0 -0
  40. data/lib/Data/UTF8.wordlist +0 -0
  41. data/lib/Data/UserDict.pdat +0 -0
  42. data/lib/Data/charset.type +0 -0
  43. data/lib/Data/nr.ctx +0 -0
  44. data/lib/Data/nr.fsa +0 -0
  45. data/lib/Data/nr.role +0 -0
  46. data/lib/nlpir/version.rb +3 -0
  47. data/lib/nlpir.rb +275 -0
  48. data/nlpir.gemspec +25 -0
  49. data/test/Data/BIG2GBK.map +0 -0
  50. data/test/Data/BIG5.pdat +0 -0
  51. data/test/Data/BIG5.wordlist +0 -0
  52. data/test/Data/BiWord.big +0 -0
  53. data/test/Data/Configure.xml +15 -0
  54. data/test/Data/CoreDict.pdat +0 -0
  55. data/test/Data/CoreDict.pos +0 -0
  56. data/test/Data/CoreDict.unig +0 -0
  57. data/test/Data/FieldDict.pdat +0 -0
  58. data/test/Data/FieldDict.pos +0 -0
  59. data/test/Data/GBK.pdat +0 -0
  60. data/test/Data/GBK.wordlist +0 -0
  61. data/test/Data/GBK2BIG.map +0 -0
  62. data/test/Data/GBK2GBKC.map +0 -0
  63. data/test/Data/GBK2UTF.map +0 -0
  64. data/test/Data/GBKA.pdat +0 -0
  65. data/test/Data/GBKA.wordlist +0 -0
  66. data/test/Data/GBKA2UTF.map +0 -0
  67. data/test/Data/GBKC.pdat +0 -0
  68. data/test/Data/GBKC.wordlist +0 -0
  69. data/test/Data/GBKC2GBK.map +0 -0
  70. data/test/Data/GranDict.pdat +3369 -8
  71. data/test/Data/GranDict.pos +0 -0
  72. data/test/Data/ICTPOS.map +96 -0
  73. data/test/Data/NLPIR.ctx +0 -0
  74. data/test/Data/NLPIR.user +0 -0
  75. data/test/Data/NLPIR_First.map +96 -0
  76. data/test/Data/NewWord.lst +73 -0
  77. data/test/Data/PKU.map +96 -0
  78. data/test/Data/PKU_First.map +96 -0
  79. data/test/Data/UTF2GBK.map +0 -0
  80. data/test/Data/UTF2GBKA.map +0 -0
  81. data/test/Data/UTF8.pdat +0 -0
  82. data/test/Data/UTF8.wordlist +0 -0
  83. data/test/Data/UserDict.pdat +0 -0
  84. data/test/Data/charset.type +0 -0
  85. data/test/Data/nr.ctx +0 -0
  86. data/test/Data/nr.fsa +0 -0
  87. data/test/Data/nr.role +0 -0
  88. data/test/test.txt +52 -0
  89. data/test/test_nlpir.rb +158 -0
  90. data/test/test_result.txt +87 -0
  91. data/test/userdict.txt +5 -0
  92. metadata +206 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 65987eefe0d616b08e0f6659c43cd8b79469dab1
4
+ data.tar.gz: 9fe53a61bea4bd9a877665c6f4edc096c5f80365
5
+ SHA512:
6
+ metadata.gz: b08f5d4d63371d12f7a73bf810ab97f8e2288d52e4a60913c29ba8797c55567d6c5a96b95a243079e0ef57cb8a15a5e7188a954f37c7588d6314bb558ecc1367
7
+ data.tar.gz: b48b25cee53d3b7158acce346bd608eec8ce97628949d3740f3c133bee7efa86cfe53ba2fe1b0710e1d4c91472593aeef2f0f8aaa930e3c8b7bf8878e9953943
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 JoeWoo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,195 @@
1
+ # Nlpir_win
2
+
3
+ A RubyGem wrapper for the Chinese word segmentation tool ICTCLAS2013.
4
+
5
+ Nlpir version 0.0.4: the gem nlpir-0.0.4-x86-mingw32 supports Ruby >= 2.0.0 on Windows 7.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'nlpir'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install nlpir
20
+
21
+ ## Usage
22
+
23
+
24
+ Some constants you may use:
25
+ ```ruby
26
+
27
+ NLPIR_FALSE = 0
28
+ NLPIR_TRUE = 1
29
+ POS_MAP_NUMBER = 4
30
+ ICT_POS_MAP_FIRST = 1 #计算所一级标注集
31
+ ICT_POS_MAP_SECOND = 0 #计算所二级标注集
32
+ PKU_POS_MAP_SECOND = 2 #北大二级标注集
33
+ PKU_POS_MAP_FIRST = 3 #北大一级标注集
34
+ POS_SIZE = 40
35
+
36
+ #词条结构体 term struct
37
+ Result_t = struct ['int start','int length',"char sPOS[#{POS_SIZE}]",'int iPOS',
38
+ 'int word_ID','int word_type','double weight']
39
+
40
+ GBK_CODE = 0 #GBK编码
41
+ UTF8_CODE = GBK_CODE + 1 #UTF8编码
42
+ BIG5_CODE = GBK_CODE + 2 #BIG5编码
43
+ GBK_FANTI_CODE = GBK_CODE + 3 #GBK编码,包含繁体字
44
+
45
+ ```
46
+
47
+ After installing the gem, you can use it as follows:
48
+
49
+ You can also find more examples in the test cases [here](https://github.com/JoeWoo/nlpir_win/blob/master/test/test_nlpir.rb)
50
+
51
+ ```ruby
52
+
53
+ require 'nlpir'
54
+ include Nlpir
55
+
56
+ s = "坚定不移沿着中国特色社会主义道路前进 为全面建成小康社会而奋斗"
57
+ #first of all : Call the NLPIR API NLPIR_Init
58
+
59
+ NLPIR_Init(nil, UTF8_CODE , File.expand_path("../", __FILE__))
60
+
61
+ #example1: Process a paragraph, and return the result text with POS or not
62
+ puts NLPIR_ParagraphProcess(s, NLPIR_TRUE)
63
+ puts NLPIR_ParagraphProcess(s, NLPIR_FALSE)
64
+
65
+ #example2: Process a paragraph, and return an array whose elements are POS-tagged words.
66
+ #tips: NLPIR_ParagraphProcessA() returns the array; its memory is allocated by NLPIR and will be freed by NLPIR_Exit() (memory in server)
67
+
68
+ words_list = NLPIR_ParagraphProcessA(s)
69
+ i=1
70
+ words_list.each do |a|
71
+ sWhichDic=""
72
+ case a.word_type
73
+ when 0
74
+ sWhichDic = "核心词典"
75
+ when 1
76
+ sWhichDic = "用户词典"
77
+ when 2
78
+ sWhichDic = "专业词典"
79
+ end
80
+ puts "No.#{i}:start:#{a.start}, length:#{a.length}, POS_ID:#{a.sPOS},word_ID:#{a.word_ID},word_type:#{a.word_type} , UserDefine:#{sWhichDic}, Word:#{s.byteslice(a.start,a.length)}, Weight:#{a.weight}\n"
81
+ i += 1
82
+ end
83
+
84
+ #example3: Process a paragraph, and return an array whose elements are POS-tagged words.
85
+ #tips: NLPIR_ParagraphProcessAW() returns the array; its memory is allocated by Ruby's Fiddle and is collected by the GC (memory in agent)
86
+
87
+ words_list = NLPIR_ParagraphProcessAW(s)
88
+ i=1
89
+ words_list.each do |a|
90
+ sWhichDic=""
91
+ case a.word_type
92
+ when 0
93
+ sWhichDic = "核心词典"
94
+ when 1
95
+ sWhichDic = "用户词典"
96
+ when 2
97
+ sWhichDic = "专业词典"
98
+ end
99
+ puts "No.#{i}:start:#{a.start}, length:#{a.length}, POS_ID:#{a.sPOS},word_ID:#{a.word_ID},word_type:#{a.word_type} , UserDefine:#{sWhichDic}, Word:#{s.byteslice(a.start,a.length)}, Weight:#{a.weight}\n"
100
+ i += 1
101
+ end
102
+
103
+ #example4: Process a text file, and write the result text to a file
104
+ puts NLPIR_FileProcess("./test.txt", "./test_result.txt", NULL)
105
+
106
+
107
+ #example5: Get ProcessAWordCount, it returns the count of the words
108
+ puts count = NLPIR_GetParagraphProcessAWordCount(s)
109
+
110
+
111
+ #example6: Add/Delete a word to the user dictionary (the path of user dictionary is ./data/userdict.pdat)
112
+ puts NLPIR_ParagraphProcess("我们都是爱思客")
113
+ #add a user word
114
+ NLPIR_AddUserWord("都是爱思客 n")
115
+ puts NLPIR_ParagraphProcess("我们都是爱思客")
116
+ #save the user word to disk
117
+ NLPIR_SaveTheUsrDic()
118
+ puts NLPIR_ParagraphProcess("我们都是爱思客")
119
+ #delete a user word
120
+ NLPIR_DelUsrWord("都是爱思客")
121
+ #save the change to disk
122
+ NLPIR_SaveTheUsrDic()
123
+
124
+
125
+ #example7: Import user-defined dictionary from a text file. and puts NLPIR result
126
+ puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
127
+ puts NLPIR_ImportUserDict("./userdict.txt")
128
+ NLPIR_AddUserWord("1989年春夏之交的政治风波 n")
129
+ #you can see the example file ./test/userdict.txt to learn the user dictionary's format requirements
130
+ puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
131
+ NLPIR_DelUsrWord("1989年春夏之交的政治风波")
132
+ puts NLPIR_ParagraphProcess("1989年春夏之交的政治风波1989年政治风波24小时降雪量24小时降雨量863计划ABC防护训练APEC会议BB机BP机C2系统C3I系统C3系统C4ISR系统C4I系统CCITT建议")
133
+
134
+
135
+ #example8: Get keywords of text
136
+ #2nd parameter is the MaxNumber of keywords
137
+ #3rd parameter is a switch to show the weight output or not
138
+ puts NLPIR_GetKeyWords(s, 50,NLPIR_TRUE)
139
+
140
+
141
+ #example9: Get keywords from file
142
+ puts NLPIR_GetFileKeyWords("./test.txt",50, NLPIR_TRUE)
143
+
144
+
145
+ #example10: Find new words from text
146
+ puts NLPIR_GetNewWords(s, 50, NLPIR_TRUE)
147
+
148
+
149
+ #example11: Find new words from file
150
+ puts NLPIR_GetFileNewWords("./test.txt")
151
+
152
+
153
+ #example12: Extract a finger print from the paragraph
154
+ puts NLPIR_FingerPrint(s)
155
+
156
+
157
+ #example13: select which pos map will use
158
+ #ICT_POS_MAP_FIRST #//计算所一级标注集
159
+ #ICT_POS_MAP_SECOND #//计算所二级标注集
160
+ #PKU_POS_MAP_SECOND #//北大二级标注集
161
+ #PKU_POS_MAP_FIRST #//北大一级标注集
162
+ NLPIR_SetPOSmap(ICT_POS_MAP_FIRST)
163
+ puts NLPIR_ParagraphProcess(s)
164
+ NLPIR_SetPOSmap(PKU_POS_MAP_FIRST)
165
+ puts NLPIR_ParagraphProcess(s)
166
+
167
+
168
+
169
+ # 新词发现批量处理功能
170
+ #以下函数为2013版本专门针对新词发现的过程,一般建议脱机实现,不宜在线处理
171
+ # 新词识别完成后,再自动导入到分词系统中,即可完成
172
+
173
+ NLPIR_NWI_Start() #启动新词发现功能
174
+ f=File.new("test.txt", "r")
175
+ text=f.read
176
+ NLPIR_NWI_AddFile(text)#添加新词训练的文件,可反复添加
177
+ NLPIR_NWI_Complete()#添加文件或者训练内容结束
178
+ f.close()
179
+ puts NLPIR_NWI_GetResult()#输出新词识别结果
180
+ #puts NLPIR_FileProcess("a.txt","b.txt")
181
+ NLPIR_NWI_Result2UserDict()#新词识别结果导入到用户词典
182
+
183
+
184
+ #at the end, call NLPIR_Exit() to free system resources
185
+ NLPIR_Exit()
186
+
187
+ ```
188
+
189
+ ## Contributing
190
+
191
+ 1. Fork it
192
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
193
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
194
+ 4. Push to the branch (`git push origin my-new-feature`)
195
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList['test/test*.rb']
7
+ t.verbose = true
8
+ end
9
+
10
+ desc "Run tests"
11
+ task :default => :test
data/bin/NLPIR.dll ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,15 @@
1
+ <?xml version="1.0" encoding="GB2312"?>
2
+ <NLPIR>
3
+ <TagSet>ICTPOS.map</TagSet>//���Ա�ע��ӳ���ļ�
4
+ <UserDict>on</UserDict>//On��UserDictionaryapplied;Off:notapplied��
5
+ <UserDictPrior>On</UserDictPrior>//�û��ʵ�����,Addedin2006-03-16,requiredbyNECOn���û��ʵ�ͺ��Ĵʵ���ͬʱ�еĴʻ㣬�û��ʵ����ȣ������ܲ�Ҫ���ã���������Ĵʵ��еĴʶ�����Ϊ�û��ʵ䣬��Ч���ʵ��䷴
6
+ <FieldDict>off</FieldDict>//On��FieldDictionaryapplied;Off:notapplied��
7
+ <GranularityContorl>off</GranularityContorl>
8
+ <Log>On</Log>//On,Off�����磺Off:�ر���־���ܣ�On:����־����
9
+ <version>2013</version>//ϵͳ�汾��
10
+ <Modify>2012-11-14</Modify>//ϵͳ����޶�ʱ��
11
+ <Lexicon>2012-11-14</Lexicon>//�ʵ�����޶�ʱ��
12
+ <adaptive>true</adaptive>//����Ӧ�ִʣ�Ĭ��Ϊfalse������Ӧ�ִʵ�Ч�ʻ�ϵ�
13
+ <author>�Ż�ƽ��ʿ</author>//����
14
+ <Contact>pipy_zhang@msn.com</Contact>//������ϵ��ʽ
15
+ </NLPIR>
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/Data/GBK.pdat ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file