iudex-simhash 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +16 -0
- data/README.rdoc +33 -0
- data/Rakefile +39 -0
- data/bin/iudex-simhash-perftest +104 -0
- data/config/stopwords.en +35 -0
- data/lib/iudex-simhash.rb +44 -0
- data/lib/iudex-simhash/base.rb +21 -0
- data/lib/iudex-simhash/factory_helper.rb +65 -0
- data/lib/iudex-simhash/iudex-simhash-1.0.0.jar +0 -0
- data/lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb +80 -0
- data/pom.xml +44 -0
- data/test/html/gentest.html +1447 -0
- data/test/setup.rb +35 -0
- data/test/test_fuzzy_set.rb +144 -0
- data/test/test_simhash_generator.rb +103 -0
- metadata +125 -0
data/test/setup.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
# Prepend this gem's lib directory to the load path, unless it is
# already present.
lib_path = File.join( File.dirname( __FILE__ ), "..", "lib" )
unless $LOAD_PATH.include?( lib_path )
  $LOAD_PATH.unshift( lib_path )
end

require 'rubygems'

# Configure Logback console logging with the :stderr option enabled.
require 'rjack-logback'
RJack::Logback.config_console( :stderr => true )

# minitest/autorun arranges for the loaded test cases to be run.
require 'minitest/unit'
require 'minitest/autorun'
|
29
|
+
|
30
|
+
# Make test output logging compatible: no partial lines.
|
31
|
+
# class TestOut
|
32
|
+
# def print( *a ); $stdout.puts( *a ); end
|
33
|
+
# def puts( *a ); $stdout.puts( *a ); end
|
34
|
+
# end
|
35
|
+
# MiniTest::Unit.output = TestOut.new
|
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You may
|
9
|
+
# obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-simhash'
|
23
|
+
|
24
|
+
# Tests of the FuzzyList64 and FuzzyTree64 fuzzy sets, and the hex
# parsing helper, from Iudex::SimHash::BruteFuzzy.
class TestFuzzySet < MiniTest::Unit::TestCase
  include Iudex::SimHash::BruteFuzzy

  # Series that will allow all but last at 3 bit threshold, all at 2
  # bit threshold. In each series the last key is 3 bits away from the
  # first key, and every other pair of keys is at least 4 bits apart.
  TEST_SERIES = [ %w[ FFFF_FFFF_FFFF_FFFF
                      7FFF_7FFF_7FFF_7FFF
                      F7FF_F7FF_F7FF_F7FF
                      FF7F_FF7F_FF7F_FFFF ],

                  %w[ 0000_0000_0000_0000
                      0100_1000_1000_0010
                      1000_0100_0100_1000
                      0010_0010_0010_0001
                      0001_0001_0001_0000 ],

                  %w[ 0000_0000_0000_0000
                      0010_0100_0100_0100
                      0001_0000_1000_1001
                      0100_1001_0001_0000
                      0000_0010_0010_0010 ] ]

  # The hex helper parses unsigned hex into a (signed) long: all bits
  # set yields -1.
  def test_hex
    assert_equal( 0x1000_0000, hex( "1000_0000" ) )
    assert_equal( 0x7FFF_FFFF_FFFF_FFFF, hex( "7FFF_FFFF_FFFF_FFFF" ) )
    assert_equal( -1, hex( "FFFF_FFFF_FFFF_FFFF" ) )
  end

  # With a 4 bit threshold, keys 4 bits apart match and keys 5 bits
  # apart do not, with or without the high (sign) bit set.
  def test_match
    m = FuzzyList64.new( 100, 4 )
    assert( m.fuzzy_match( 0, 0 ) )
    assert( m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
                           hex( '7FFF_FFFF_7777_FFFF' ) ) )

    assert( m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
                           hex( 'FFFF_FFFF_7777_FFFF' ) ) )

    assert( ! m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
                             hex( '7FFF_FFFF_EFFF_7777' ) ) )

    assert( ! m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
                             hex( 'FFFF_FFFF_EFFF_7777' ) ) )
  end

  # addIfNotFound accepts keys far from all existing keys and rejects
  # keys within the threshold of an existing key (0xFE vs 0xFF, 0x1 vs
  # 0x0 are each a single bit apart).
  def test_add
    m = FuzzyList64.new( 100, 4 )
    assert( m.addIfNotFound( 0x0 ) )
    assert( m.addIfNotFound( 0xFF ) )
    assert( ! m.addIfNotFound( 0xFE ) )
    assert( ! m.addIfNotFound( 0x1 ) )
  end

  def test_series_list
    assert_series( FuzzyList64 )
  end

  def test_series_tree
    assert_series( FuzzyTree64 )
  end

  # Run every TEST_SERIES against fresh instances of the given fuzzy
  # set class: at 3 bit threshold the last key must be rejected, at 2
  # bit threshold all keys must be accepted.
  # NOTE(review): first constructor argument is presumably a capacity
  # hint -- confirm against the Java class.
  def assert_series( fclz )
    TEST_SERIES.each do |s|
      assert_series_last( fclz.new( 5, 3 ), s )
      assert_series_all(  fclz.new( 5, 2 ), s )
    end
  end

  # Assert that all but the last key of series s are added to fset,
  # and that the last key is then rejected.
  def assert_series_last( fset, s )
    s = s.dup # Don't modify the shared TEST_SERIES member
    last = s.pop # Remove last for now
    assert_series_all( fset, s )
    assert( ! fset.addIfNotFound( hex( last ) ), last )
  end

  # Assert that every key of series s is added to fset.
  def assert_series_all( fset, s )
    s.each { |k| assert( fset.addIfNotFound( hex( k ) ), k ) }
  end

  def test_find_series_list
    assert_find_series( FuzzyList64 )
  end

  def test_find_series_tree
    assert_find_series( FuzzyTree64 )
  end

  # As per assert_series, but exercising addFindAll.
  def assert_find_series( fclz )
    TEST_SERIES.each do |s|
      assert_find_series_last( fclz.new( 5, 3 ), s )
      assert_find_series_all(  fclz.new( 5, 2 ), s )
    end
  end

  # Assert that adding the last key of series s, after all others,
  # finds exactly one match; and that once that match (and the last
  # key itself) are removed, re-adding the last key finds none.
  def assert_find_series_last( fset, s )
    s = s.dup # Don't modify the shared TEST_SERIES member
    last = s.pop # Remove last for now
    assert_find_series_all( fset, s )
    matches = Java::java.util.ArrayList.new
    assert( ! fset.addFindAll( hex( last ), matches ) )

    # Was assert( l.size(), 1 ), which treats 1 as the failure message
    # and thus could never fail.
    assert_equal( 1, matches.size )

    # Remove the match and try again.
    assert( fset.remove( matches.get( 0 ) ), "remove match" )
    assert( fset.remove( hex( last ) ), "remove last" )
    matches.clear
    assert( ! fset.addFindAll( hex( last ), matches ) )
    assert( matches.empty? )
  end

  # Assert that for every key of series s, addFindAll returns false
  # and reports no matches.
  def assert_find_series_all( fset, s )
    s.each do |k|
      matches = Java::java.util.ArrayList.new
      assert( ! fset.addFindAll( hex( k ), matches ) )
      assert( matches.empty? )
    end
  end

  # Parse a hex string, with optional '_' digit separators, via
  # BruteFuzzy::unsignedHexToLong.
  def hex( h )
    BruteFuzzy::unsignedHexToLong( h.gsub( /_/, '' ) )
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-simhash'
|
23
|
+
require 'iudex-simhash/factory_helper'
|
24
|
+
|
25
|
+
# Tests simhash generation over parsed HTML, via a filter chain ending
# with the FactoryHelper provided simhash_generator.
class TestSimhashGenerator < MiniTest::Unit::TestCase
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  UniMap.define_accessors

  Order = HTMLTreeFilter::Order

  # The default stopwords should include the common word 'from'.
  def test_default_stopwords
    stopwords = simhash_stopwords
    assert( stopwords.contains( 'from' ) )
  end

  # The simhash of a small document should be stable, and unchanged by
  # removal of a stop word or alteration of a non-wordy element.
  def test_generate
    html = <<HTML
<html>
<head>
<title>Title</title>
</head>
<body>
<p>We are talking about the same thing here.</p>
<p>Really this is the same exact thing I was telling you last time.</p>
<p>cruft</p> <!-- Ignored by default 0.3 wordy ratio -->
</body>
</html>
HTML

    map = content( html )
    assert( filter_chain.filter( map ) )
    assert_equal( 'Title', map.title.to_s )
    assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )

    html.gsub!( /the/, "\t" ) # Removing stop words doesn't matter
    html.gsub!( /cruft/, "xcruft" ) # cruft by any other name...
    map = content( html )
    assert( filter_chain.filter( map ) )
    assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )
  end

  # Return a new UniMap with source set from the given html String.
  # The charset (default "UTF-8") is passed to HTMLUtils::source; it
  # was previously ignored in favor of a hard-coded "UTF-8".
  def content( html, charset = "UTF-8" )
    map = UniMap.new
    map.source = HTMLUtils::source( html.to_java_bytes, charset )
    map
  end

  # Build the FilterChain under test: HTML parse, title extraction and
  # cleanup, tree filters (meta skip, character normalization, word
  # and wordy counting) and finally the simhash generator.
  def filter_chain
    filters = []
    filters << HTMLParseFilter.new( :source.to_k, nil, :source_tree.to_k )
    filters << TitleExtractor.new
    filters << TextCtrlWSFilter.new( :title.to_k )

    tfc = TreeFilterChain.new( [ MetaSkipFilter.new,
                                 CharactersNormalizer.new,
                                 WordCounter.new,
                                 WordyCounter.new ] )

    filters << HTMLTreeFilter.new( :source_tree.to_k, tfc, Order::DEPTH_FIRST )

    filters << simhash_generator

    FilterChain.new( "test", filters )
  end

  # Format a long as a hex String via java.lang.Long::toHexString.
  def hex( l )
    Java::java.lang.Long::toHexString( l )
  end

end
|
metadata
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-simhash
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-html
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.7.1
|
36
|
+
- - <
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: "2.1"
|
39
|
+
type: :development
|
40
|
+
version_requirements: *id002
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rjack-logback
|
43
|
+
prerelease: false
|
44
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ~>
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "1.0"
|
50
|
+
type: :development
|
51
|
+
version_requirements: *id003
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: rjack-tarpit
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.3.0
|
61
|
+
type: :development
|
62
|
+
version_requirements: *id004
|
63
|
+
description: |-
|
64
|
+
Iudex is a general purpose web crawler and feed processor in
|
65
|
+
ruby/java. The iudex-simhash gem contains support for generation and
|
66
|
+
searching over simhash fingerprints
|
67
|
+
email:
|
68
|
+
- dek-oss@gravitext.com
|
69
|
+
executables:
|
70
|
+
- iudex-simhash-perftest
|
71
|
+
extensions: []
|
72
|
+
|
73
|
+
extra_rdoc_files:
|
74
|
+
- Manifest.txt
|
75
|
+
- History.rdoc
|
76
|
+
- README.rdoc
|
77
|
+
files:
|
78
|
+
- History.rdoc
|
79
|
+
- Manifest.txt
|
80
|
+
- README.rdoc
|
81
|
+
- Rakefile
|
82
|
+
- pom.xml
|
83
|
+
- bin/iudex-simhash-perftest
|
84
|
+
- config/stopwords.en
|
85
|
+
- lib/iudex-simhash/base.rb
|
86
|
+
- lib/iudex-simhash.rb
|
87
|
+
- lib/iudex-simhash/factory_helper.rb
|
88
|
+
- lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
|
89
|
+
- test/setup.rb
|
90
|
+
- test/test_fuzzy_set.rb
|
91
|
+
- test/test_simhash_generator.rb
|
92
|
+
- test/html/gentest.html
|
93
|
+
- lib/iudex-simhash/iudex-simhash-1.0.0.jar
|
94
|
+
has_rdoc: true
|
95
|
+
homepage: http://github.com/dekellum/iudex
|
96
|
+
licenses: []
|
97
|
+
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options:
|
100
|
+
- --main
|
101
|
+
- README.rdoc
|
102
|
+
require_paths:
|
103
|
+
- lib
|
104
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: "0"
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: "0"
|
116
|
+
requirements: []
|
117
|
+
|
118
|
+
rubyforge_project: iudex-simhash
|
119
|
+
rubygems_version: 1.5.1
|
120
|
+
signing_key:
|
121
|
+
specification_version: 3
|
122
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
123
|
+
test_files:
|
124
|
+
- test/test_fuzzy_set.rb
|
125
|
+
- test/test_simhash_generator.rb
|