iudex-simhash 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +16 -0
- data/README.rdoc +33 -0
- data/Rakefile +39 -0
- data/bin/iudex-simhash-perftest +104 -0
- data/config/stopwords.en +35 -0
- data/lib/iudex-simhash.rb +44 -0
- data/lib/iudex-simhash/base.rb +21 -0
- data/lib/iudex-simhash/factory_helper.rb +65 -0
- data/lib/iudex-simhash/iudex-simhash-1.0.0.jar +0 -0
- data/lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb +80 -0
- data/pom.xml +44 -0
- data/test/html/gentest.html +1447 -0
- data/test/setup.rb +35 -0
- data/test/test_fuzzy_set.rb +144 -0
- data/test/test_simhash_generator.rb +103 -0
- metadata +125 -0
data/test/setup.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
# Put the gem's lib directory (a sibling of this test directory) at the
# front of the load path, unless it is already listed there.
lib_path = File.join( File.dirname( __FILE__ ), "..", "lib" )
unless $LOAD_PATH.include?( lib_path )
  $LOAD_PATH.unshift( lib_path )
end
|
21
|
+
|
22
|
+
require 'rubygems'
|
23
|
+
|
24
|
+
require 'rjack-logback'
|
25
|
+
RJack::Logback.config_console( :stderr => true )
|
26
|
+
|
27
|
+
require 'minitest/unit'
|
28
|
+
require 'minitest/autorun'
|
29
|
+
|
30
|
+
# Make test output logging compatible: no partial lines.
|
31
|
+
# class TestOut
|
32
|
+
# def print( *a ); $stdout.puts( *a ); end
|
33
|
+
# def puts( *a ); $stdout.puts( *a ); end
|
34
|
+
# end
|
35
|
+
# MiniTest::Unit.output = TestOut.new
|
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You may
|
9
|
+
# obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-simhash'
|
23
|
+
|
24
|
+
# Tests of the fuzzy set classes made available by including
# Iudex::SimHash::BruteFuzzy (FuzzyList64 and FuzzyTree64). Keys are
# written as underscore-grouped hex strings and converted by #hex.
class TestFuzzySet < MiniTest::Unit::TestCase
  include Iudex::SimHash::BruteFuzzy

  # Series that will allow all but last at 3 bit threshold, all at 2
  # bit threshold.
  TEST_SERIES = [ %w[ FFFF_FFFF_FFFF_FFFF
                      7FFF_7FFF_7FFF_7FFF
                      F7FF_F7FF_F7FF_F7FF
                      FF7F_FF7F_FF7F_FFFF ],

                  %w[ 0000_0000_0000_0000
                      0100_1000_1000_0010
                      1000_0100_0100_1000
                      0010_0010_0010_0001
                      0001_0001_0001_0000 ],

                  %w[ 0000_0000_0000_0000
                      0010_0100_0100_0100
                      0001_0000_1000_1001
                      0100_1001_0001_0000
                      0000_0010_0010_0010 ] ]

  # The #hex helper yields signed 64-bit values: all bits set is -1.
  def test_hex
    assert_equal( 0x1000_0000, hex( "1000_0000" ) )
    assert_equal( 0x7FFF_FFFF_FFFF_FFFF, hex( "7FFF_FFFF_FFFF_FFFF" ) )
    assert_equal( -1, hex( "FFFF_FFFF_FFFF_FFFF" ) )
  end

  # Pairs differing in 4 bits match; pairs differing in 5 bits do not.
  # NOTE(review): presumably the second constructor argument (4) is the
  # maximum number of differing bits -- confirm against FuzzyList64.
  def test_match
    m = FuzzyList64.new( 100, 4 )
    assert( m.fuzzy_match( 0, 0 ) )
    assert( m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
                           hex( '7FFF_FFFF_7777_FFFF' ) ) )

    assert( m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
                           hex( 'FFFF_FFFF_7777_FFFF' ) ) )

    assert( ! m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
                             hex( '7FFF_FFFF_EFFF_7777' ) ) )

    assert( ! m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
                             hex( 'FFFF_FFFF_EFFF_7777' ) ) )
  end

  # addIfNotFound is expected to return true for 0x0 and 0xFF, and false
  # for 0xFE and 0x1, which are each one bit away from an earlier key.
  def test_add
    m = FuzzyList64.new( 100, 4 )
    assert( m.addIfNotFound( 0x0 ) )
    assert( m.addIfNotFound( 0xFF ) )
    assert( ! m.addIfNotFound( 0xFE ) )
    assert( ! m.addIfNotFound( 0x1 ) )
  end

  def test_series_list
    assert_series( FuzzyList64 )
  end

  def test_series_tree
    assert_series( FuzzyTree64 )
  end

  # Runs every TEST_SERIES against fresh instances of fclz: with second
  # argument 3 the last key must be rejected, with 2 all must be added.
  def assert_series( fclz )
    TEST_SERIES.each do |s|
      assert_series_last( fclz.new( 5, 3 ), s )
      assert_series_all( fclz.new( 5, 2 ), s )
    end
  end

  # Asserts all but the last key of s are added to fset, and that the
  # last key is then not added.
  def assert_series_last( fset, s )
    s = s.dup # pop below must not alter the shared TEST_SERIES arrays
    last = s.pop # Remove last for now
    assert_series_all( fset, s )
    assert( ! fset.addIfNotFound( hex( last ) ), last )
  end

  # Asserts every key of s is added to fset; the key is the failure
  # message.
  def assert_series_all( fset, s )
    s.each { |k| assert( fset.addIfNotFound( hex( k ) ), k ) }
  end

  def test_find_series_list
    assert_find_series( FuzzyList64 )
  end

  def test_find_series_tree
    assert_find_series( FuzzyTree64 )
  end

  # Same as assert_series, but exercising addFindAll.
  def assert_find_series( fclz )
    TEST_SERIES.each do |s|
      assert_find_series_last( fclz.new( 5, 3 ), s )
      assert_find_series_all( fclz.new( 5, 2 ), s )
    end
  end

  # Adds all but the last key of s expecting no matches, then asserts
  # that addFindAll of the last key collects exactly one match. The
  # match and the last key are then removed, after which addFindAll of
  # the last key must collect nothing.
  def assert_find_series_last( fset, s )
    s = s.dup # pop below must not alter the shared TEST_SERIES arrays
    last = s.pop # Remove last for now
    assert_find_series_all( fset, s )
    l = Java::java.util.ArrayList.new
    assert( ! fset.addFindAll( hex( last ), l ) )
    # Previously assert( l.size(), 1 ), which could never fail: any
    # Integer is truthy and the 1 was taken as the failure message.
    assert_equal( 1, l.size )

    # Remove the match and try again.
    assert( fset.remove( l.get( 0 ) ), "remove match" )
    assert( fset.remove( hex( last ) ), "remove last" )
    l.clear
    assert( ! fset.addFindAll( hex( last ), l ) )
    assert( l.empty? )
  end

  # Asserts that addFindAll of each key of s returns false and collects
  # no matches.
  def assert_find_series_all( fset, s )
    s.each do |k|
      l = Java::java.util.ArrayList.new
      assert( ! fset.addFindAll( hex( k ), l ) )
      assert( l.empty? )
    end
  end

  # Converts an underscore-grouped hex string to a long via
  # BruteFuzzy::unsignedHexToLong, after stripping the underscores.
  def hex( h )
    BruteFuzzy::unsignedHexToLong( h.gsub( /_/, '' ) )
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-simhash'
|
23
|
+
require 'iudex-simhash/factory_helper'
|
24
|
+
|
25
|
+
# Tests simhash generation by running HTML content through a filter
# chain that ends with the simhash generator.
class TestSimhashGenerator < MiniTest::Unit::TestCase
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  UniMap.define_accessors

  Order = HTMLTreeFilter::Order

  # The stopword set should contain the common word 'from'.
  # NOTE(review): simhash_stopwords is presumably supplied by
  # FactoryHelper -- confirm.
  def test_default_stopwords
    stopwords = simhash_stopwords
    assert( stopwords.contains( 'from' ) )
  end

  # The simhash of a small document must equal a fixed value, and must
  # not change when "the" is replaced by a tab or when the "cruft"
  # paragraph text is altered.
  def test_generate
    html = <<HTML
<html>
<head>
<title>Title</title>
</head>
<body>
<p>We are talking about the same thing here.</p>
<p>Really this is the same exact thing I was telling you last time.</p>
<p>cruft</p> <!-- Ignored by default 0.3 wordy ratio -->
</body>
</html>
HTML

    map = content( html )
    assert( filter_chain.filter( map ) )
    assert_equal( 'Title', map.title.to_s )
    assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )

    html.gsub!( /the/, "\t" ) # Removing stop words doesn't matter
    html.gsub!( /cruft/, "xcruft" ) # cruft by any other name...
    map = content( html )
    assert( filter_chain.filter( map ) )
    assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )
  end

  # Returns a new UniMap whose source is built from the bytes of html,
  # labeled with the given charset name (default "UTF-8").
  def content( html, charset = "UTF-8" )
    map = UniMap.new
    # Pass the charset argument through; it was previously ignored in
    # favor of a hard-coded "UTF-8".
    map.source = HTMLUtils::source( html.to_java_bytes, charset )
    map
  end

  # Builds the "test" FilterChain: parse the source to a tree, extract
  # and clean the title, apply the tree filters depth first, then
  # generate the simhash.
  def filter_chain
    filters = []
    filters << HTMLParseFilter.new( :source.to_k, nil, :source_tree.to_k )
    filters << TitleExtractor.new
    filters << TextCtrlWSFilter.new( :title.to_k )

    tfc = TreeFilterChain.new( [ MetaSkipFilter.new,
                                 CharactersNormalizer.new,
                                 WordCounter.new,
                                 WordyCounter.new ] )

    filters << HTMLTreeFilter.new( :source_tree.to_k, tfc, Order::DEPTH_FIRST )

    filters << simhash_generator

    FilterChain.new( "test", filters )
  end

  # Formats the long l as a hex string via java.lang.Long::toHexString.
  def hex( l )
    Java::java.lang.Long::toHexString( l )
  end

end
|
metadata
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-simhash
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-html
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.7.1
|
36
|
+
- - <
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: "2.1"
|
39
|
+
type: :development
|
40
|
+
version_requirements: *id002
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rjack-logback
|
43
|
+
prerelease: false
|
44
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ~>
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "1.0"
|
50
|
+
type: :development
|
51
|
+
version_requirements: *id003
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: rjack-tarpit
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.3.0
|
61
|
+
type: :development
|
62
|
+
version_requirements: *id004
|
63
|
+
description: |-
|
64
|
+
Iudex is a general purpose web crawler and feed processor in
|
65
|
+
ruby/java. The iudex-simhash gem contains support for generation and
|
66
|
+
searching over simhash fingerprints
|
67
|
+
email:
|
68
|
+
- dek-oss@gravitext.com
|
69
|
+
executables:
|
70
|
+
- iudex-simhash-perftest
|
71
|
+
extensions: []
|
72
|
+
|
73
|
+
extra_rdoc_files:
|
74
|
+
- Manifest.txt
|
75
|
+
- History.rdoc
|
76
|
+
- README.rdoc
|
77
|
+
files:
|
78
|
+
- History.rdoc
|
79
|
+
- Manifest.txt
|
80
|
+
- README.rdoc
|
81
|
+
- Rakefile
|
82
|
+
- pom.xml
|
83
|
+
- bin/iudex-simhash-perftest
|
84
|
+
- config/stopwords.en
|
85
|
+
- lib/iudex-simhash/base.rb
|
86
|
+
- lib/iudex-simhash.rb
|
87
|
+
- lib/iudex-simhash/factory_helper.rb
|
88
|
+
- lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
|
89
|
+
- test/setup.rb
|
90
|
+
- test/test_fuzzy_set.rb
|
91
|
+
- test/test_simhash_generator.rb
|
92
|
+
- test/html/gentest.html
|
93
|
+
- lib/iudex-simhash/iudex-simhash-1.0.0.jar
|
94
|
+
has_rdoc: true
|
95
|
+
homepage: http://github.com/dekellum/iudex
|
96
|
+
licenses: []
|
97
|
+
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options:
|
100
|
+
- --main
|
101
|
+
- README.rdoc
|
102
|
+
require_paths:
|
103
|
+
- lib
|
104
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: "0"
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: "0"
|
116
|
+
requirements: []
|
117
|
+
|
118
|
+
rubyforge_project: iudex-simhash
|
119
|
+
rubygems_version: 1.5.1
|
120
|
+
signing_key:
|
121
|
+
specification_version: 3
|
122
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
123
|
+
test_files:
|
124
|
+
- test/test_fuzzy_set.rb
|
125
|
+
- test/test_simhash_generator.rb
|