simhash_tim_modified 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +28 -0
- data/README.rdoc +53 -0
- data/Rakefile +46 -0
- data/ext/string_hashing/extconf.rb +49 -0
- data/ext/string_hashing/string_hashing.c +64 -0
- data/lib/integer.rb +17 -0
- data/lib/simhash/stopwords/chinese_stopword.txt +506 -0
- data/lib/simhash/stopwords/en.rb +6 -0
- data/lib/simhash/stopwords/ru.rb +6 -0
- data/lib/simhash/stopwords/zh.rb +6 -0
- data/lib/simhash/stopwords.rb +10 -0
- data/lib/simhash.rb +76 -0
- data/lib/string.rb +41 -0
- metadata +156 -0
data/LICENSE
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
|
2
|
+
LICENSE
|
3
|
+
|
4
|
+
The MIT License
|
5
|
+
|
6
|
+
Copyright (c) 2002 Charikar, Simhash algorythm
|
7
|
+
Copyright (c) 2009 Andre Hagenbruch, Python implementation
|
8
|
+
Copyright (c) 2010 Bookmate.ru
|
9
|
+
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
12
|
+
in the Software without restriction, including without limitation the rights
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
15
|
+
furnished to do so, subject to the following conditions:
|
16
|
+
|
17
|
+
The above copyright notice and this permission notice shall be included in
|
18
|
+
all copies or substantial portions of the Software.
|
19
|
+
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
26
|
+
THE SOFTWARE.
|
27
|
+
|
28
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
==Abstract
|
2
|
+
|
3
|
+
This is an implementation of {Moses Charikar's simhashes}[http://portal.acm.org/citation.cfm?id=509965] in Ruby.
|
4
|
+
|
5
|
+
==Usage
|
6
|
+
|
7
|
+
When you have a string and want to calculate its simhash, call:
|
8
|
+
|
9
|
+
my_string.simhash
|
10
|
+
|
11
|
+
By default it will generate 64-bit integer - that is simhash for this string
|
12
|
+
|
13
|
+
It's always better to tokenize string before simhashing. It's as simple as
|
14
|
+
|
15
|
+
my_string.simhash(:split_by => / /)
|
16
|
+
|
17
|
+
This will also generate a 64-bit integer, but it will split the string into words first.
|
18
|
+
It's handy when you need to calculate similarity of strings based on word usage.
|
19
|
+
You can split string as you like: by letters/sentences/specific letter-combinations, etc.
|
20
|
+
|
21
|
+
my_string.simhash(:split_by => /./, :hashbits => 512)
|
22
|
+
|
23
|
+
Sometimes you might need a longer simhash (finding similarity for very long strings is a good example).
|
24
|
+
You can set the length of the resulting hash by passing the hashbits parameter. This example will return a 512-bit simhash
|
25
|
+
for your string split by sentences.
|
26
|
+
|
27
|
+
==Advanced usage
|
28
|
+
|
29
|
+
It's useful to clean your string before simhashing. But it's useful not to clean, too.
|
30
|
+
|
31
|
+
Here are examples:
|
32
|
+
|
33
|
+
my_string.simhash(:stop_words => true) # here we clean
|
34
|
+
|
35
|
+
This will find stop-words in your string and remove them before simhashing. Stop-words are "the", "not", "about", etc.
|
36
|
+
Currently we remove Russian, English and Chinese stop-words.
|
37
|
+
|
38
|
+
my_string.simhash(:preserve_punctuation => true) # here we do not
|
39
|
+
|
40
|
+
This will not remove punctuation before simhashing. Yes, we remove all dots, commas, etc. after splitting string to words by default.
|
41
|
+
Because different punctuation does not generally mean a difference in content. If you do not agree, you can turn this default off.
|
42
|
+
|
43
|
+
==Installation
|
44
|
+
|
45
|
+
As usual:
|
46
|
+
|
47
|
+
gem install simhash
|
48
|
+
|
49
|
+
But if you have {GNU MP library}[http://gmplib.org/], simhash will work faster! To check out which version is used, type:
|
50
|
+
|
51
|
+
Simhash::DEFAULT_STRING_HASH_METHOD
|
52
|
+
|
53
|
+
It should return a symbol. If the symbol ends with "rb", the pure-Ruby hashing method is in use and your simhash is slow. If you want to make it faster, install GNU MP.
|
data/Rakefile
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
|
5
|
+
$LOAD_PATH << File.join(File.dirname(__FILE__), 'lib')
|
6
|
+
require 'simhash'
|
7
|
+
|
8
|
+
desc 'Default: run unit tests.'
|
9
|
+
task :default => [:test]
|
10
|
+
|
11
|
+
desc 'Test the simhash gem'
|
12
|
+
Rake::TestTask.new(:test) do |t|
|
13
|
+
t.libs << '.'
|
14
|
+
t.pattern = 'test/**/*_test.rb'
|
15
|
+
t.verbose = true
|
16
|
+
end
|
17
|
+
|
18
|
+
desc 'Start an IRB session with all necessary files required.'
|
19
|
+
task :shell do |t|
|
20
|
+
chdir File.dirname(__FILE__)
|
21
|
+
exec 'irb -I lib/ -I lib/simhash -I lib/string -I lib/integer -r rubygems'
|
22
|
+
end
|
23
|
+
|
24
|
+
desc 'Build the gemspec.'
|
25
|
+
task :gemspec do |t|
|
26
|
+
exec 'gem build simhash.gemspec'
|
27
|
+
end
|
28
|
+
|
29
|
+
# Prints every entry of `spec.files`, one path per line.
# NOTE(review): no `spec` local or method is defined anywhere in this Rakefile,
# so as written this task looks like it raises NameError when run -- confirm
# where `spec` (presumably a Gem::Specification) is expected to come from.
desc "Print a list of the files to be put into the gem"
|
30
|
+
task :manifest do
|
31
|
+
spec.files.each do |file|
|
32
|
+
puts file
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Writes `spec.to_ruby` into "<spec.name>.gemspec" in the current directory.
# NOTE(review): a :gemspec task is already declared earlier in this Rakefile
# with an `exec 'gem build simhash.gemspec'` action. Rake appends actions to an
# existing task, and `exec` replaces the running process, so this second action
# presumably never runs -- confirm and merge or rename one of the two.
# NOTE(review): `spec` is not defined in this Rakefile -- confirm its source.
desc "Generate a gemspec file for GitHub"
|
37
|
+
task :gemspec do
|
38
|
+
File.open("#{spec.name}.gemspec", 'w') do |f|
|
39
|
+
f.write spec.to_ruby
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Runs the :gemspec task first, then shells out to `gem build` for the
# generated gemspec. The command runs in backticks, so its output is captured
# and discarded rather than printed, and a failing build is not reported.
# NOTE(review): `spec` is not defined in this Rakefile -- confirm its source.
desc "Build the gem into the current directory"
|
44
|
+
task :gem => :gemspec do
|
45
|
+
`gem build #{spec.name}.gemspec`
|
46
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'mkmf'
|
3
|
+
|
4
|
+
extension_name = 'string_hashing'
|
5
|
+
# should link against the libgmp library
|
6
|
+
$LDFLAGS << ' -lgmp'
|
7
|
+
|
8
|
+
# Sort out the universal vs. single-architecture build problems on MacOS X
|
9
|
+
if RUBY_PLATFORM.include?( 'darwin' )
|
10
|
+
puts "MacOS X build: fixing architecture flags:"
|
11
|
+
|
12
|
+
commonflags = nil
|
13
|
+
if ENV['ARCHFLAGS']
|
14
|
+
puts " using the value in ARCHFLAGS environment variable (%p)." % [ ENV['ARCHFLAGS'] ]
|
15
|
+
commonflags = ENV['ARCHFLAGS']
|
16
|
+
else
|
17
|
+
$stderr.puts %{
|
18
|
+
=========== WARNING ===========
|
19
|
+
|
20
|
+
You are building this extension on OS X without setting the
|
21
|
+
ARCHFLAGS environment variable.
|
22
|
+
|
23
|
+
If you are seeing this message, that means that the
|
24
|
+
build will probably fail.
|
25
|
+
|
26
|
+
===================================
|
27
|
+
}.gsub( /^\t+/, ' ' )
|
28
|
+
end
|
29
|
+
|
30
|
+
if commonflags
|
31
|
+
$CFLAGS.gsub!( /-arch\s+\S+ /, '' )
|
32
|
+
$LDFLAGS.gsub!( /-arch\s+\S+ /, '' )
|
33
|
+
CONFIG['LDSHARED'].gsub!( /-arch\s+\S+ /, '' )
|
34
|
+
|
35
|
+
$CFLAGS << ' ' << commonflags
|
36
|
+
$LDFLAGS << ' ' << commonflags
|
37
|
+
CONFIG['LDSHARED'] << ' ' << commonflags
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
if (find_header("gmp.h") rescue false)
|
42
|
+
$stderr.puts "Configuring extensions"
|
43
|
+
dir_config(extension_name)
|
44
|
+
create_makefile(extension_name)
|
45
|
+
else
|
46
|
+
$stderr.puts "Skipping building of C extension"
|
47
|
+
# creating foo Makefile to avoid building stuff
|
48
|
+
File.open(File.join(File.dirname(__FILE__), "Makefile"), "w"){|f| f.write("all: \ninstall: \n")}
|
49
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#include "ruby.h"
#include <gmp.h>
#include <stdio.h>
#include <stdlib.h>

VALUE StringHashing = Qnil;

void Init_string_hashing(void);

VALUE method_hash_vl(VALUE self, VALUE bitlength);

/* Extension entry point: registers String#hash_vl, taking one argument. */
void Init_string_hashing(void) {
  rb_define_method(rb_cString, "hash_vl", method_hash_vl, 1);
}

/*
 * String#hash_vl(bitlength) -> String
 *
 * GMP-backed variable-length string hash. Starting from (first byte << 7),
 * every byte b of the receiver updates the state as
 *     x = ((x * 1000003) ^ b) & (2**bitlength - 1)
 * and the byte length of the string is XOR-ed in at the end.
 *
 * The value is returned as a base-10 Ruby String; callers convert it with
 * #to_i. An empty receiver yields "0".
 *
 * Raises ArgumentError when bitlength is negative (it would otherwise be
 * reinterpreted as a huge unsigned exponent when building the mask).
 */
VALUE method_hash_vl(VALUE self, VALUE bitlength) {
  const unsigned long m = 1000003;
  int bl = NUM2INT(bitlength);
  long len = RSTRING_LEN(self);
  const unsigned char *string = (const unsigned char *)RSTRING_PTR(self);
  char *result;
  long i;
  VALUE res;
  mpz_t x, mask, long_len, computations, byte;

  /* Nothing to hash. Return before anything is allocated so nothing leaks. */
  if (len == 0) {
    return rb_str_new2("0");
  }

  if (bl < 0) {
    rb_raise(rb_eArgError, "bitlength must be non-negative");
  }

  mpz_init_set_ui(long_len, (unsigned long)len);
  mpz_init_set_ui(x, (unsigned long)string[0] << 7);

  /* mask = 2**bl - 1, i.e. bl low bits set to 1 */
  mpz_init(mask);
  mpz_ui_pow_ui(mask, 2, (unsigned long)bl);
  mpz_sub_ui(mask, mask, 1);

  mpz_init(computations);
  mpz_init2(byte, 8);

  for (i = 0; i < len; i++) {
    mpz_set_ui(byte, string[i]);
    mpz_mul_ui(computations, x, m);
    mpz_xor(computations, computations, byte);
    mpz_and(x, mask, computations);
  }

  /* The length is mixed in after masking, so the result may exceed bl bits. */
  mpz_xor(x, x, long_len);

  /* Size the buffer from the final value: digits + optional sign + NUL. */
  result = malloc(mpz_sizeinbase(x, 10) + 2);
  if (result == NULL) {
    mpz_clear(x);
    mpz_clear(byte);
    mpz_clear(computations);
    mpz_clear(mask);
    mpz_clear(long_len);
    rb_memerror();
  }

  mpz_get_str(result, 10, x);
  res = rb_str_new2(result);

  mpz_clear(x);
  mpz_clear(byte);
  mpz_clear(computations);
  mpz_clear(mask);
  mpz_clear(long_len);
  free(result);

  return res;
}
|
data/lib/integer.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
class Integer
  # Hamming distance: the number of bit positions in which the receiver and
  # +integer+ differ.
  #
  #   0b1001.hamming_distance_to(0b1110) # => 3
  #   0b1001.hamming_distance_to(0b1000) # => 1
  #
  # The distance is the number of 1 bits in the XOR of the two values. When
  # that XOR is negative (the operands have different signs) the result is 0,
  # because no finite bit count exists for it.
  def hamming_distance_to(integer)
    difference = self ^ integer
    return 0 unless difference > 0

    # Count set bits through the binary string form instead of shifting one
    # bit at a time in Ruby.
    difference.to_s(2).count('1')
  end
end
|
@@ -0,0 +1,506 @@
|
|
1
|
+
?
|
2
|
+
、
|
3
|
+
。
|
4
|
+
“
|
5
|
+
”
|
6
|
+
《
|
7
|
+
》
|
8
|
+
!
|
9
|
+
,
|
10
|
+
:
|
11
|
+
;
|
12
|
+
?
|
13
|
+
啊
|
14
|
+
阿
|
15
|
+
哎
|
16
|
+
哎呀
|
17
|
+
哎哟
|
18
|
+
唉
|
19
|
+
俺
|
20
|
+
俺们
|
21
|
+
按
|
22
|
+
按照
|
23
|
+
吧
|
24
|
+
吧哒
|
25
|
+
把
|
26
|
+
罢了
|
27
|
+
被
|
28
|
+
本
|
29
|
+
本着
|
30
|
+
比
|
31
|
+
比方
|
32
|
+
比如
|
33
|
+
鄙人
|
34
|
+
彼
|
35
|
+
彼此
|
36
|
+
边
|
37
|
+
别
|
38
|
+
别的
|
39
|
+
别说
|
40
|
+
并
|
41
|
+
并且
|
42
|
+
不比
|
43
|
+
不成
|
44
|
+
不单
|
45
|
+
不但
|
46
|
+
不独
|
47
|
+
不管
|
48
|
+
不光
|
49
|
+
不过
|
50
|
+
不仅
|
51
|
+
不拘
|
52
|
+
不论
|
53
|
+
不怕
|
54
|
+
不然
|
55
|
+
不如
|
56
|
+
不特
|
57
|
+
不惟
|
58
|
+
不问
|
59
|
+
不只
|
60
|
+
朝
|
61
|
+
朝着
|
62
|
+
趁
|
63
|
+
趁着
|
64
|
+
乘
|
65
|
+
冲
|
66
|
+
除
|
67
|
+
除此之外
|
68
|
+
除非
|
69
|
+
除了
|
70
|
+
此
|
71
|
+
此间
|
72
|
+
此外
|
73
|
+
从
|
74
|
+
从而
|
75
|
+
打
|
76
|
+
待
|
77
|
+
但
|
78
|
+
但是
|
79
|
+
当
|
80
|
+
当着
|
81
|
+
到
|
82
|
+
得
|
83
|
+
的
|
84
|
+
的话
|
85
|
+
等
|
86
|
+
等等
|
87
|
+
地
|
88
|
+
第
|
89
|
+
叮咚
|
90
|
+
对
|
91
|
+
对于
|
92
|
+
多
|
93
|
+
多少
|
94
|
+
而
|
95
|
+
而况
|
96
|
+
而且
|
97
|
+
而是
|
98
|
+
而外
|
99
|
+
而言
|
100
|
+
而已
|
101
|
+
尔后
|
102
|
+
反过来
|
103
|
+
反过来说
|
104
|
+
反之
|
105
|
+
非但
|
106
|
+
非徒
|
107
|
+
否则
|
108
|
+
嘎
|
109
|
+
嘎登
|
110
|
+
该
|
111
|
+
赶
|
112
|
+
个
|
113
|
+
各
|
114
|
+
各个
|
115
|
+
各位
|
116
|
+
各种
|
117
|
+
各自
|
118
|
+
给
|
119
|
+
根据
|
120
|
+
跟
|
121
|
+
故
|
122
|
+
故此
|
123
|
+
固然
|
124
|
+
关于
|
125
|
+
管
|
126
|
+
归
|
127
|
+
果然
|
128
|
+
果真
|
129
|
+
过
|
130
|
+
哈
|
131
|
+
哈哈
|
132
|
+
呵
|
133
|
+
和
|
134
|
+
何
|
135
|
+
何处
|
136
|
+
何况
|
137
|
+
何时
|
138
|
+
嘿
|
139
|
+
哼
|
140
|
+
哼唷
|
141
|
+
呼哧
|
142
|
+
乎
|
143
|
+
哗
|
144
|
+
还是
|
145
|
+
还有
|
146
|
+
换句话说
|
147
|
+
换言之
|
148
|
+
或
|
149
|
+
或是
|
150
|
+
或者
|
151
|
+
极了
|
152
|
+
及
|
153
|
+
及其
|
154
|
+
及至
|
155
|
+
即
|
156
|
+
即便
|
157
|
+
即或
|
158
|
+
即令
|
159
|
+
即若
|
160
|
+
即使
|
161
|
+
几
|
162
|
+
几时
|
163
|
+
己
|
164
|
+
既
|
165
|
+
既然
|
166
|
+
既是
|
167
|
+
继而
|
168
|
+
加之
|
169
|
+
假如
|
170
|
+
假若
|
171
|
+
假使
|
172
|
+
鉴于
|
173
|
+
将
|
174
|
+
较
|
175
|
+
较之
|
176
|
+
叫
|
177
|
+
接着
|
178
|
+
结果
|
179
|
+
借
|
180
|
+
紧接着
|
181
|
+
进而
|
182
|
+
尽
|
183
|
+
尽管
|
184
|
+
经
|
185
|
+
经过
|
186
|
+
就
|
187
|
+
就是
|
188
|
+
就是说
|
189
|
+
据
|
190
|
+
具体地说
|
191
|
+
具体说来
|
192
|
+
开始
|
193
|
+
开外
|
194
|
+
靠
|
195
|
+
咳
|
196
|
+
可
|
197
|
+
可见
|
198
|
+
可是
|
199
|
+
可以
|
200
|
+
况且
|
201
|
+
啦
|
202
|
+
来
|
203
|
+
来着
|
204
|
+
离
|
205
|
+
例如
|
206
|
+
哩
|
207
|
+
连
|
208
|
+
连同
|
209
|
+
两者
|
210
|
+
了
|
211
|
+
临
|
212
|
+
另
|
213
|
+
另外
|
214
|
+
另一方面
|
215
|
+
论
|
216
|
+
嘛
|
217
|
+
吗
|
218
|
+
慢说
|
219
|
+
漫说
|
220
|
+
冒
|
221
|
+
么
|
222
|
+
每
|
223
|
+
每当
|
224
|
+
们
|
225
|
+
莫若
|
226
|
+
某
|
227
|
+
某个
|
228
|
+
某些
|
229
|
+
拿
|
230
|
+
哪
|
231
|
+
哪边
|
232
|
+
哪儿
|
233
|
+
哪个
|
234
|
+
哪里
|
235
|
+
哪年
|
236
|
+
哪怕
|
237
|
+
哪天
|
238
|
+
哪些
|
239
|
+
哪样
|
240
|
+
那
|
241
|
+
那边
|
242
|
+
那儿
|
243
|
+
那个
|
244
|
+
那会儿
|
245
|
+
那里
|
246
|
+
那么
|
247
|
+
那么些
|
248
|
+
那么样
|
249
|
+
那时
|
250
|
+
那些
|
251
|
+
那样
|
252
|
+
乃
|
253
|
+
乃至
|
254
|
+
呢
|
255
|
+
能
|
256
|
+
你
|
257
|
+
你们
|
258
|
+
您
|
259
|
+
宁
|
260
|
+
宁可
|
261
|
+
宁肯
|
262
|
+
宁愿
|
263
|
+
哦
|
264
|
+
呕
|
265
|
+
啪达
|
266
|
+
旁人
|
267
|
+
呸
|
268
|
+
凭
|
269
|
+
凭借
|
270
|
+
其
|
271
|
+
其次
|
272
|
+
其二
|
273
|
+
其他
|
274
|
+
其它
|
275
|
+
其一
|
276
|
+
其余
|
277
|
+
其中
|
278
|
+
起
|
279
|
+
起见
|
280
|
+
起见
|
281
|
+
岂但
|
282
|
+
恰恰相反
|
283
|
+
前后
|
284
|
+
前者
|
285
|
+
且
|
286
|
+
然而
|
287
|
+
然后
|
288
|
+
然则
|
289
|
+
让
|
290
|
+
人家
|
291
|
+
任
|
292
|
+
任何
|
293
|
+
任凭
|
294
|
+
如
|
295
|
+
如此
|
296
|
+
如果
|
297
|
+
如何
|
298
|
+
如其
|
299
|
+
如若
|
300
|
+
如上所述
|
301
|
+
若
|
302
|
+
若非
|
303
|
+
若是
|
304
|
+
啥
|
305
|
+
上下
|
306
|
+
尚且
|
307
|
+
设若
|
308
|
+
设使
|
309
|
+
甚而
|
310
|
+
甚么
|
311
|
+
甚至
|
312
|
+
省得
|
313
|
+
时候
|
314
|
+
什么
|
315
|
+
什么样
|
316
|
+
使得
|
317
|
+
是
|
318
|
+
是的
|
319
|
+
首先
|
320
|
+
谁
|
321
|
+
谁知
|
322
|
+
顺
|
323
|
+
顺着
|
324
|
+
似的
|
325
|
+
虽
|
326
|
+
虽然
|
327
|
+
虽说
|
328
|
+
虽则
|
329
|
+
随
|
330
|
+
随着
|
331
|
+
所
|
332
|
+
所以
|
333
|
+
他
|
334
|
+
他们
|
335
|
+
他人
|
336
|
+
它
|
337
|
+
它们
|
338
|
+
她
|
339
|
+
她们
|
340
|
+
倘
|
341
|
+
倘或
|
342
|
+
倘然
|
343
|
+
倘若
|
344
|
+
倘使
|
345
|
+
腾
|
346
|
+
替
|
347
|
+
通过
|
348
|
+
同
|
349
|
+
同时
|
350
|
+
哇
|
351
|
+
万一
|
352
|
+
往
|
353
|
+
望
|
354
|
+
为
|
355
|
+
为何
|
356
|
+
为了
|
357
|
+
为什么
|
358
|
+
为着
|
359
|
+
喂
|
360
|
+
嗡嗡
|
361
|
+
我
|
362
|
+
我们
|
363
|
+
呜
|
364
|
+
呜呼
|
365
|
+
乌乎
|
366
|
+
无论
|
367
|
+
无宁
|
368
|
+
毋宁
|
369
|
+
嘻
|
370
|
+
吓
|
371
|
+
相对而言
|
372
|
+
像
|
373
|
+
向
|
374
|
+
向着
|
375
|
+
嘘
|
376
|
+
呀
|
377
|
+
焉
|
378
|
+
沿
|
379
|
+
沿着
|
380
|
+
要
|
381
|
+
要不
|
382
|
+
要不然
|
383
|
+
要不是
|
384
|
+
要么
|
385
|
+
要是
|
386
|
+
也
|
387
|
+
也罢
|
388
|
+
也好
|
389
|
+
一
|
390
|
+
一般
|
391
|
+
一旦
|
392
|
+
一方面
|
393
|
+
一来
|
394
|
+
一切
|
395
|
+
一样
|
396
|
+
一则
|
397
|
+
依
|
398
|
+
依照
|
399
|
+
矣
|
400
|
+
以
|
401
|
+
以便
|
402
|
+
以及
|
403
|
+
以免
|
404
|
+
以至
|
405
|
+
以至于
|
406
|
+
以致
|
407
|
+
抑或
|
408
|
+
因
|
409
|
+
因此
|
410
|
+
因而
|
411
|
+
因为
|
412
|
+
哟
|
413
|
+
用
|
414
|
+
由
|
415
|
+
由此可见
|
416
|
+
由于
|
417
|
+
有
|
418
|
+
有的
|
419
|
+
有关
|
420
|
+
有些
|
421
|
+
又
|
422
|
+
于
|
423
|
+
于是
|
424
|
+
于是乎
|
425
|
+
与
|
426
|
+
与此同时
|
427
|
+
与否
|
428
|
+
与其
|
429
|
+
越是
|
430
|
+
云云
|
431
|
+
哉
|
432
|
+
再说
|
433
|
+
再者
|
434
|
+
在
|
435
|
+
在下
|
436
|
+
咱
|
437
|
+
咱们
|
438
|
+
则
|
439
|
+
怎
|
440
|
+
怎么
|
441
|
+
怎么办
|
442
|
+
怎么样
|
443
|
+
怎样
|
444
|
+
咋
|
445
|
+
照
|
446
|
+
照着
|
447
|
+
者
|
448
|
+
这
|
449
|
+
这边
|
450
|
+
这儿
|
451
|
+
这个
|
452
|
+
这会儿
|
453
|
+
这就是说
|
454
|
+
这里
|
455
|
+
这么
|
456
|
+
这么点儿
|
457
|
+
这么些
|
458
|
+
这么样
|
459
|
+
这时
|
460
|
+
这些
|
461
|
+
这样
|
462
|
+
正如
|
463
|
+
吱
|
464
|
+
之
|
465
|
+
之类
|
466
|
+
之所以
|
467
|
+
之一
|
468
|
+
只是
|
469
|
+
只限
|
470
|
+
只要
|
471
|
+
只有
|
472
|
+
至
|
473
|
+
至于
|
474
|
+
诸位
|
475
|
+
着
|
476
|
+
着呢
|
477
|
+
自
|
478
|
+
自从
|
479
|
+
自个儿
|
480
|
+
自各儿
|
481
|
+
自己
|
482
|
+
自家
|
483
|
+
自身
|
484
|
+
综上所述
|
485
|
+
总的来看
|
486
|
+
总的来说
|
487
|
+
总的说来
|
488
|
+
总而言之
|
489
|
+
总之
|
490
|
+
纵
|
491
|
+
纵令
|
492
|
+
纵然
|
493
|
+
纵使
|
494
|
+
遵照
|
495
|
+
作为
|
496
|
+
兮
|
497
|
+
呃
|
498
|
+
呗
|
499
|
+
咚
|
500
|
+
咦
|
501
|
+
喏
|
502
|
+
啐
|
503
|
+
喔唷
|
504
|
+
嗬
|
505
|
+
嗯
|
506
|
+
嗳
|
@@ -0,0 +1,6 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
module Simhash
|
3
|
+
module Stopwords
|
4
|
+
EN = " a able about above abst accordance according accordingly across act actually added adj adopted affected affecting affects after afterwards again against ah all almost alone along already also although always am among amongst an and announce another any anybody anyhow anymore anyone anything anyway anyways anywhere apparently approximately are aren arent arise around as aside ask asking at auth available away awfully b back be became because become becomes becoming been before beforehand begin beginning beginnings begins behind being believe below beside besides between beyond biol both brief briefly but by c ca came can cannot can't cause causes certain certainly co com come comes contain containing contains could couldnt d date did didn't different do does doesn't doing done don't down downwards due during e each ed edu effect eg eight eighty either else elsewhere end ending enough especially et et-al etc even ever every everybody everyone everything everywhere ex except f far few ff fifth first five fix followed following follows for former formerly forth found four from further furthermore g gave get gets getting give given gives giving go goes gone got gotten h had happens hardly has hasn't have haven't having he hed hence her here hereafter hereby herein heres hereupon hers herself hes hi hid him himself his hither home how howbeit however hundred i id ie if i'll im immediate immediately importance important in inc indeed index information instead into invention inward is isn't it itd it'll its itself i've j just k keep keeps kept keys kg km know known knows l largely last lately later latter latterly least less lest let lets like liked likely line little 'll look looking looks ltd m made mainly make makes many may maybe me mean means meantime meanwhile merely mg might million miss ml more moreover most mostly mr mrs much mug must my myself n na name namely nay nd near nearly necessarily necessary need needs neither never nevertheless new next nine 
ninety no nobody non none nonetheless noone nor normally nos not noted nothing now nowhere o obtain obtained obviously of off often oh ok okay old omitted on once one ones only onto or ord other others otherwise ought our ours ourselves out outside over overall owing own p page pages part particular particularly past per perhaps placed please plus poorly possible possibly potentially pp predominantly present previously primarily probably promptly proud provides put q que quickly quite qv r ran rather rd re readily really recent recently ref refs regarding regardless regards related relatively research respectively resulted resulting results right run s said same saw say saying says sec section see seeing seem seemed seeming seems seen self selves sent seven several shall she shed she'll shes should shouldn't show showed shown showns shows significant significantly similar similarly since six slightly so some somebody somehow someone somethan something sometime sometimes somewhat somewhere soon sorry specifically specified specify specifying state states still stop strongly sub substantially successfully such sufficiently suggest sup sure t take taken taking tell tends th than thank thanks thanx that that'll thats that've the their theirs them themselves then thence there thereafter thereby thered therefore therein there'll thereof therere theres thereto thereupon there've these they theyd they'll theyre they've think this those thou though thoughh thousand throug through throughout thru thus til tip to together too took toward towards tried tries truly try trying ts twice two u un under unfortunately unless unlike unlikely until unto up upon ups us use used useful usefully usefulness uses using usually v value various 've very via viz vol vols vs w want wants was wasn't way we wed welcome we'll went were weren't we've what whatever what'll whats when whence whenever where whereafter whereas whereby wherein wheres whereupon wherever whether which while whim whither 
who whod whoever whole who'll whom whomever whos whose why widely willing wish with within without won't words world would wouldn't www x y yes yet you youd you'll your youre yours yourself yourselves you've z zero "
|
5
|
+
end
|
6
|
+
end
|
@@ -0,0 +1,6 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
module Simhash
|
3
|
+
module Stopwords
|
4
|
+
RU = " а е и ж м о на не ни об но он мне мои мож она они оно мной много многочисленное многочисленная многочисленные многочисленный мною мой мог могут можно может можхо мор моя моё мочь над нее оба нам нем нами ними мимо немного одной одного менее однажды однако меня нему меньше ней наверху него ниже мало надо один одиннадцать одиннадцатый назад наиболее недавно миллионов недалеко между низко меля нельзя нибудь непрерывно наконец никогда никуда нас наш нет нею неё них мира наша наше наши ничего начала нередко несколько обычно опять около мы ну нх от отовсюду особенно нужно очень отсюда в во вон вниз внизу вокруг вот восемнадцать восемнадцатый восемь восьмой вверх вам вами важное важная важные важный вдали везде ведь вас ваш ваша ваше ваши впрочем весь вдруг вы все второй всем всеми времени время всему всего всегда всех всею всю вся всё всюду г год говорил говорит года году где да ее за из ли же им до по ими под иногда довольно именно долго позже более должно пожалуйста значит иметь больше пока ему имя пор пора потом потому после почему почти посреди ей два две двенадцать двенадцатый двадцать двадцатый двух его дел или без день занят занята занято заняты действительно давно девятнадцать девятнадцатый девять девятый даже алло жизнь далеко близко здесь дальше для лет зато даром первый перед затем зачем лишь десять десятый ею её их бы еще при был про процентов против просто бывает бывь если люди была были было будем будет будете будешь прекрасно буду будь будто будут ещё пятнадцать пятнадцатый друго другое другой другие другая других есть пять быть лучше пятый к ком конечно кому кого когда которой которого которая которые который которых кем каждое каждая каждые каждый кажется как какой какая кто кроме куда кругом с т у я та те уж со то том снова тому совсем того тогда тоже собой тобой собою тобою сначала только уметь тот тою хорошо хотеть хочешь хоть хотя свое свои твой своей своего своих свою твоя твоё раз уже сам там тем чем сама сами теми само рано самом самому 
самой самого семнадцать семнадцатый самим самими самих саму семь чему раньше сейчас чего сегодня себе тебе сеаой человек разве теперь себя тебя седьмой спасибо слишком так такое такой такие также такая сих тех чаще четвертый через часто шестой шестнадцать шестнадцатый шесть четыре четырнадцать четырнадцатый сколько сказал сказала сказать ту ты три эта эти что это чтоб этом этому этой этого чтобы этот стал туда этим этими рядом тринадцать тринадцатый этих третий тут эту суть чуть тысяч "
|
5
|
+
end
|
6
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), "stopwords", "en")
|
3
|
+
require File.join(File.dirname(__FILE__), "stopwords", "ru")
|
4
|
+
require File.join(File.dirname(__FILE__), "stopwords", "zh")
|
5
|
+
|
6
|
+
module Simhash
|
7
|
+
module Stopwords
|
8
|
+
ALL = RU + EN + ZH
|
9
|
+
end
|
10
|
+
end
|
data/lib/simhash.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
require 'active_support/core_ext/string/multibyte'
|
4
|
+
require 'unicode'
|
5
|
+
|
6
|
+
require 'string'
|
7
|
+
require 'integer'
|
8
|
+
require 'simhash/stopwords'
|
9
|
+
|
10
|
+
begin
|
11
|
+
require 'string_hashing'
|
12
|
+
rescue LoadError
|
13
|
+
end
|
14
|
+
|
15
|
+
# Simhash fingerprints computed over lists of string tokens.
module Simhash
  # Name of the String method used to hash a single token: :hash_vl when the
  # C extension defined it, otherwise the pure-Ruby :hash_vl_rb.
  # method_defined? is used because public_instance_methods returns Symbols on
  # Ruby >= 1.9, so looking up the String "hash_vl" in that list never matched.
  DEFAULT_STRING_HASH_METHOD = String.method_defined?(:hash_vl) ? :hash_vl : :hash_vl_rb

  # Runs of whitespace, digits, non-letters, dashes and quotes that are
  # collapsed to a single space (\302\240 is a non-breaking space).
  PUNCTUATION_REGEXP = if RUBY_VERSION >= "1.9"
    /(\s|\d|[^\p{L}]|\302\240| *— *|[«»…\-–—]| )+/u
  else
    /(\s|\d|\W|\302\240| *— *|[«»…\-–—]| )+/u
  end

  # Computes the simhash fingerprint of +tokens+.
  #
  # tokens  - Enumerable of Strings.
  # options - Hash:
  #           :hashbits       - fingerprint width in bits (default 64).
  #           :hashing_method - name of the String method used to hash each
  #                             token; it is called with the bit width and its
  #                             result is converted with #to_i
  #                             (default DEFAULT_STRING_HASH_METHOD).
  #           Plus every option understood by each_filtered_token.
  #
  # Returns an Integer. Bit i is set when at least as many surviving tokens
  # had bit i set in their hash as had it clear, so an input with no surviving
  # tokens yields a fingerprint with all +hashbits+ bits set.
  def self.hash(tokens, options={})
    hashbits = options[:hashbits] || 64
    hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD

    # One signed vote counter per bit position.
    votes = Array.new(hashbits, 0)

    each_filtered_token(tokens, options) do |token|
      hashed_token = token.send(hashing_method, hashbits).to_i
      hashbits.times do |i|
        votes[i] += hashed_token[i].zero? ? -1 : 1
      end
    end

    fingerprint = 0
    hashbits.times { |i| fingerprint |= (1 << i) if votes[i] >= 0 }
    fingerprint
  end

  # Normalises every token and yields the ones that survive filtering.
  #
  # tokens  - Enumerable of Strings. The tokens themselves are not modified.
  # options - Hash:
  #           :preserve_punctuation - when truthy, skip the punctuation cleanup.
  #           :stop_words           - when truthy, drop words found in
  #                                   Stopwords::ALL.
  #           :stop_sentenses       - optional collection; a token is skipped
  #                                   when it includes " <token> ".
  #           :token_min_size       - minimum token length in characters
  #                                   (default 0).
  def self.each_filtered_token(tokens, options={})
    token_min_size = options[:token_min_size].to_i
    stop_sentenses = options[:stop_sentenses]

    tokens.each do |token|
      # cutting punctuation; work on a copy so that the caller's string keeps
      # its encoding and frozen tokens are accepted
      unless options[:preserve_punctuation]
        token = token.dup.force_encoding('UTF-8').gsub(PUNCTUATION_REGEXP, ' ')
      end

      token = Unicode::downcase(token.strip)

      # cutting stop-words
      if options[:stop_words]
        token = token.split(" ").reject { |w| Stopwords::ALL.include?(" #{w} ") }.join(" ")
      end

      # cutting stop-sentenses
      next if stop_sentenses && stop_sentenses.include?(" #{token} ")

      next if token.empty? || token.mb_chars.size < token_min_size

      yield token
    end
  end

  # Returns an Array with every token each_filtered_token would yield.
  def self.filtered_tokens(tokens, options={})
    filtered_tokens = []
    each_filtered_token(tokens, options) { |token| filtered_tokens << token }
    filtered_tokens
  end

  # Returns the name of the string hashing method in use. This used to read a
  # class variable that was never assigned and therefore raised NameError.
  def self.hm
    DEFAULT_STRING_HASH_METHOD
  end
end
|
data/lib/string.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
require 'rmmseg'
|
4
|
+
require 'tyccl'
|
5
|
+
require 'debugger'
|
6
|
+
|
7
|
+
class String
|
8
|
+
include RMMSeg
|
9
|
+
RMMSeg::Dictionary.load_dictionaries
|
10
|
+
|
11
|
+
# Simhash fingerprint of this string: the string is cut into tokens with
# #segment and the token list is handed to Simhash.hash together with
# +options+, which are passed through untouched.
def simhash(options={})
  tokens = segment
  Simhash.hash(tokens, options)
end
|
14
|
+
|
15
|
+
# Pure-Ruby variable-length string hash.
#
# Starting from (first byte << 7), every byte b of the string updates the
# state as x = ((x * 1000003) ^ b) & mask, where mask has +length+ low bits
# set; the byte length of the string is XOR-ed in at the end.
#
# length - hash width in bits.
#
# Returns an Integer; 0 for the empty string. Because the byte length is mixed
# in after masking, the value can exceed +length+ bits when the string is
# longer than 2**length bytes.
def hash_vl_rb(length)
  return 0 if empty?

  multiplier = 1000003
  mask = (1 << length) - 1

  # getbyte/bytesize avoid building the whole byte list just to read its
  # first element and its size.
  x = getbyte(0) << 7
  each_byte { |byte| x = ((x * multiplier) ^ byte) & mask }

  # Every operand above is non-negative, so x can never be -1 and the former
  # "-1 becomes -2" adjustment was unreachable.
  x ^ bytesize
end
|
27
|
+
|
28
|
+
# Cuts the string into tokens with RMMSeg and replaces each token by the
# first entry of the first result Tyccl.get_similar returns for it, falling
# back to the token's own text when the lookup yields nothing or fails.
#
# Returns an Array of Strings.
def segment
  algor = RMMSeg::Algorithm.new(self)
  result = []

  loop do
    tok = algor.next_token
    # A nil token (or a token without text) marks the end of the input. This
    # is now checked explicitly instead of relying on a rescued NoMethodError.
    break if tok.nil?

    text = tok.text
    break if text.nil?

    # Only the synonym lookup is best-effort: any StandardError raised while
    # querying Tyccl falls back to the original token text.
    word = begin
      Tyccl.get_similar(text.force_encoding('utf-8')).first.first
    rescue StandardError
      nil
    end

    result << (word.nil? ? text : word)
  end

  result
end
|
40
|
+
|
41
|
+
end
|
metadata
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: simhash_tim_modified
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.6
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alex Gusev
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-09-02 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rmmseg-cpp-huacnlee
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.2.9
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.2.9
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: debugger
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: tyccl
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 0.0.2
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.0.2
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: algorithms
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: unicode
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 0.3.1
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 0.3.1
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: activesupport
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
description: Implementation of Charikar simhashes in Ruby
|
111
|
+
email: alex.gusev@bookmate.ru
|
112
|
+
executables: []
|
113
|
+
extensions:
|
114
|
+
- ext/string_hashing/extconf.rb
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- README.rdoc
|
118
|
+
- LICENSE
|
119
|
+
- Rakefile
|
120
|
+
- lib/integer.rb
|
121
|
+
- lib/simhash/stopwords/chinese_stopword.txt
|
122
|
+
- lib/simhash/stopwords/en.rb
|
123
|
+
- lib/simhash/stopwords/ru.rb
|
124
|
+
- lib/simhash/stopwords/zh.rb
|
125
|
+
- lib/simhash/stopwords.rb
|
126
|
+
- lib/simhash.rb
|
127
|
+
- lib/string.rb
|
128
|
+
- ext/string_hashing/extconf.rb
|
129
|
+
- ext/string_hashing/string_hashing.c
|
130
|
+
homepage: http://github.com/timlang/simhash
|
131
|
+
licenses: []
|
132
|
+
post_install_message:
|
133
|
+
rdoc_options: []
|
134
|
+
require_paths:
|
135
|
+
- lib
|
136
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
|
+
none: false
|
144
|
+
requirements:
|
145
|
+
- - ! '>='
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
requirements: []
|
149
|
+
rubyforge_project: simhash
|
150
|
+
rubygems_version: 1.8.25
|
151
|
+
signing_key:
|
152
|
+
specification_version: 3
|
153
|
+
summary: ! 'Gives you possbility to convert string into simhashes to futher use: finding
|
154
|
+
near-duplicates, similar strings, etc.'
|
155
|
+
test_files: []
|
156
|
+
has_rdoc:
|