iudex-simhash 1.0.0-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +4 -0
- data/Manifest.txt +1 -1
- data/Rakefile +3 -3
- data/lib/iudex-simhash/base.rb +1 -1
- data/lib/iudex-simhash/factory_helper.rb +7 -4
- data/lib/iudex-simhash/{iudex-simhash-1.0.0.jar → iudex-simhash-1.1.0.jar} +0 -0
- data/pom.xml +3 -3
- data/test/test_fuzzy_set.rb +17 -17
- metadata +9 -13
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
|
|
4
4
|
require 'iudex-simhash/base'
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
|
-
gem 'rjack-tarpit', '~> 1.
|
7
|
+
gem 'rjack-tarpit', '~> 1.4'
|
8
8
|
require 'rjack-tarpit'
|
9
9
|
|
10
10
|
t = RJack::TarPit.new( 'iudex-simhash',
|
@@ -13,10 +13,10 @@ t = RJack::TarPit.new( 'iudex-simhash',
|
|
13
13
|
|
14
14
|
t.specify do |h|
|
15
15
|
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
16
|
-
h.extra_deps += [ [ 'iudex-html', '~> 1.
|
16
|
+
h.extra_deps += [ [ 'iudex-html', '~> 1.1.0' ] ]
|
17
17
|
|
18
18
|
h.testlib = :minitest
|
19
|
-
h.extra_dev_deps += [ [ 'minitest', '
|
19
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
|
20
20
|
[ 'rjack-logback', '~> 1.0' ] ]
|
21
21
|
end
|
22
22
|
|
data/lib/iudex-simhash/base.rb
CHANGED
@@ -28,9 +28,10 @@ module Iudex
|
|
28
28
|
'config', 'stopwords.en' )
|
29
29
|
|
30
30
|
def simhash_stopwords( wfile = DEFAULT_WORDS )
|
31
|
-
words =
|
32
|
-
|
33
|
-
|
31
|
+
words =
|
32
|
+
File.open( wfile ) { |fin| fin.readlines }.
|
33
|
+
map { |w| w.strip }.
|
34
|
+
reject { |w| w =~ /^#/ }
|
34
35
|
|
35
36
|
Gen::StopWordSet.new( words )
|
36
37
|
end
|
@@ -40,7 +41,9 @@ module Iudex
|
|
40
41
|
def simhash_generator( input = :simhash_generator_inputs,
|
41
42
|
stopwords = simhash_stopwords )
|
42
43
|
|
43
|
-
inputs = send( input ).
|
44
|
+
inputs = send( input ).
|
45
|
+
map { |r| Array( r ) }.
|
46
|
+
map do | key, ratio |
|
44
47
|
key = key.to_k
|
45
48
|
i = if( key.value_type == Element.java_class )
|
46
49
|
SimHashGenerator::Input.forTree( key )
|
Binary file
|
data/pom.xml
CHANGED
@@ -3,13 +3,13 @@
|
|
3
3
|
<groupId>iudex</groupId>
|
4
4
|
<artifactId>iudex-simhash</artifactId>
|
5
5
|
<packaging>jar</packaging>
|
6
|
-
<version>1.
|
6
|
+
<version>1.1.0</version>
|
7
7
|
<name>Iudex simhash production and searching</name>
|
8
8
|
|
9
9
|
<parent>
|
10
10
|
<groupId>iudex</groupId>
|
11
11
|
<artifactId>iudex-parent</artifactId>
|
12
|
-
<version>1.
|
12
|
+
<version>1.1</version>
|
13
13
|
<relativePath>..</relativePath>
|
14
14
|
</parent>
|
15
15
|
|
@@ -18,7 +18,7 @@
|
|
18
18
|
<dependency>
|
19
19
|
<groupId>iudex</groupId>
|
20
20
|
<artifactId>iudex-html</artifactId>
|
21
|
-
<version>[1.
|
21
|
+
<version>[1.1,1.2)</version>
|
22
22
|
</dependency>
|
23
23
|
|
24
24
|
<dependency>
|
data/test/test_fuzzy_set.rb
CHANGED
@@ -51,26 +51,26 @@ class TestFuzzySet < MiniTest::Unit::TestCase
|
|
51
51
|
|
52
52
|
def test_match
|
53
53
|
m = FuzzyList64.new( 100, 4 )
|
54
|
-
assert(
|
55
|
-
assert(
|
56
|
-
|
54
|
+
assert( m.fuzzy_match( 0, 0 ) )
|
55
|
+
assert( m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
|
56
|
+
hex( '7FFF_FFFF_7777_FFFF' ) ) )
|
57
57
|
|
58
|
-
assert(
|
59
|
-
|
58
|
+
assert( m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
|
59
|
+
hex( 'FFFF_FFFF_7777_FFFF' ) ) )
|
60
60
|
|
61
|
-
|
62
|
-
|
61
|
+
refute( m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
|
62
|
+
hex( '7FFF_FFFF_EFFF_7777' ) ) )
|
63
63
|
|
64
|
-
|
65
|
-
|
64
|
+
refute( m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
|
65
|
+
hex( 'FFFF_FFFF_EFFF_7777' ) ) )
|
66
66
|
end
|
67
67
|
|
68
68
|
def test_add
|
69
69
|
m = FuzzyList64.new( 100, 4 )
|
70
|
-
assert(
|
71
|
-
assert(
|
72
|
-
|
73
|
-
|
70
|
+
assert( m.addIfNotFound( 0x0 ) )
|
71
|
+
assert( m.addIfNotFound( 0xFF ) )
|
72
|
+
refute( m.addIfNotFound( 0xFE ) )
|
73
|
+
refute( m.addIfNotFound( 0x1 ) )
|
74
74
|
end
|
75
75
|
|
76
76
|
def test_series_list
|
@@ -92,7 +92,7 @@ class TestFuzzySet < MiniTest::Unit::TestCase
|
|
92
92
|
s = s.dup
|
93
93
|
last = s.pop # Remove last for now
|
94
94
|
assert_series_all( fset, s )
|
95
|
-
|
95
|
+
refute( fset.addIfNotFound( hex( last ) ), last )
|
96
96
|
end
|
97
97
|
|
98
98
|
def assert_series_all( fset, s )
|
@@ -119,21 +119,21 @@ class TestFuzzySet < MiniTest::Unit::TestCase
|
|
119
119
|
last = s.pop # Remove last for now
|
120
120
|
assert_find_series_all( fset, s )
|
121
121
|
l = Java::java.util.ArrayList.new;
|
122
|
-
|
122
|
+
refute( fset.addFindAll( hex( last ), l ) )
|
123
123
|
assert( l.size(), 1 );
|
124
124
|
|
125
125
|
# Remove the match and try again.
|
126
126
|
assert( fset.remove( l.get( 0 ) ), "remove match" )
|
127
127
|
assert( fset.remove( hex( last ) ), "remove last" )
|
128
128
|
l.clear
|
129
|
-
|
129
|
+
refute( fset.addFindAll( hex( last ), l ) )
|
130
130
|
assert( l.empty? )
|
131
131
|
end
|
132
132
|
|
133
133
|
def assert_find_series_all( fset, s )
|
134
134
|
s.each do |k|
|
135
135
|
l = Java::java.util.ArrayList.new;
|
136
|
-
|
136
|
+
refute( fset.addFindAll( hex( k ), l ) )
|
137
137
|
assert( l.empty? )
|
138
138
|
end
|
139
139
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: iudex-simhash
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.
|
5
|
+
version: 1.1.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,8 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-11-13 00:00:00 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: iudex-html
|
@@ -21,7 +20,7 @@ dependencies:
|
|
21
20
|
requirements:
|
22
21
|
- - ~>
|
23
22
|
- !ruby/object:Gem::Version
|
24
|
-
version: 1.
|
23
|
+
version: 1.1.0
|
25
24
|
type: :runtime
|
26
25
|
version_requirements: *id001
|
27
26
|
- !ruby/object:Gem::Dependency
|
@@ -30,12 +29,9 @@ dependencies:
|
|
30
29
|
requirement: &id002 !ruby/object:Gem::Requirement
|
31
30
|
none: false
|
32
31
|
requirements:
|
33
|
-
- -
|
34
|
-
- !ruby/object:Gem::Version
|
35
|
-
version: 1.7.1
|
36
|
-
- - <
|
32
|
+
- - ~>
|
37
33
|
- !ruby/object:Gem::Version
|
38
|
-
version: "2.
|
34
|
+
version: "2.3"
|
39
35
|
type: :development
|
40
36
|
version_requirements: *id002
|
41
37
|
- !ruby/object:Gem::Dependency
|
@@ -57,7 +53,7 @@ dependencies:
|
|
57
53
|
requirements:
|
58
54
|
- - ~>
|
59
55
|
- !ruby/object:Gem::Version
|
60
|
-
version: 1.
|
56
|
+
version: 1.4.0
|
61
57
|
type: :development
|
62
58
|
version_requirements: *id004
|
63
59
|
description: |-
|
@@ -90,8 +86,8 @@ files:
|
|
90
86
|
- test/test_fuzzy_set.rb
|
91
87
|
- test/test_simhash_generator.rb
|
92
88
|
- test/html/gentest.html
|
93
|
-
- lib/iudex-simhash/iudex-simhash-1.
|
94
|
-
|
89
|
+
- lib/iudex-simhash/iudex-simhash-1.1.0.jar
|
90
|
+
- .gemtest
|
95
91
|
homepage: http://github.com/dekellum/iudex
|
96
92
|
licenses: []
|
97
93
|
|
@@ -116,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
112
|
requirements: []
|
117
113
|
|
118
114
|
rubyforge_project: iudex-simhash
|
119
|
-
rubygems_version: 1.
|
115
|
+
rubygems_version: 1.8.9
|
120
116
|
signing_key:
|
121
117
|
specification_version: 3
|
122
118
|
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|