iudex-simhash 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,35 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
# Put this gem's lib directory at the front of the load path, unless it
# is already listed there.
lib_dir = File.join( File.dirname( __FILE__ ), "..", "lib" )
unless $LOAD_PATH.include?( lib_dir )
  $LOAD_PATH.unshift( lib_dir )
end

require 'rubygems'

# Configure Logback console logging with the :stderr option enabled.
require 'rjack-logback'
RJack::Logback.config_console( :stderr => true )

# Test framework; autorun runs the loaded test cases at exit.
require 'minitest/unit'
require 'minitest/autorun'
29
+
30
+ # Make test output logging compatible: no partial lines.
31
+ # class TestOut
32
+ # def print( *a ); $stdout.puts( *a ); end
33
+ # def puts( *a ); $stdout.puts( *a ); end
34
+ # end
35
+ # MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-simhash'
23
+
24
# Tests for the fuzzy (bit-distance) key sets FuzzyList64 and FuzzyTree64,
# which are reached by including Iudex::SimHash::BruteFuzzy.
#
# Both classes are constructed here as `new( n, bits )`. Going by the
# TEST_SERIES comment and its usage below, the second argument is the bit
# threshold; the first is presumably an initial capacity (TODO confirm
# against the Java class).
class TestFuzzySet < MiniTest::Unit::TestCase
  include Iudex::SimHash::BruteFuzzy

  # Series that will allow all but last at 3 bit threshold, all at 2
  # bit threshold.
  TEST_SERIES = [ %w[ FFFF_FFFF_FFFF_FFFF
                      7FFF_7FFF_7FFF_7FFF
                      F7FF_F7FF_F7FF_F7FF
                      FF7F_FF7F_FF7F_FFFF ],

                  %w[ 0000_0000_0000_0000
                      0100_1000_1000_0010
                      1000_0100_0100_1000
                      0010_0010_0010_0001
                      0001_0001_0001_0000 ],

                  %w[ 0000_0000_0000_0000
                      0010_0100_0100_0100
                      0001_0000_1000_1001
                      0100_1001_0001_0000
                      0000_0010_0010_0010 ] ]

  # The hex helper accepts underscore-grouped digits; a 64-bit value with
  # the top bit set is expected back as a negative number (-1 for all F's).
  def test_hex
    assert_equal( 0x1000_0000, hex( "1000_0000" ) )
    assert_equal( 0x7FFF_FFFF_FFFF_FFFF, hex( "7FFF_FFFF_FFFF_FFFF" ) )
    assert_equal( -1, hex( "FFFF_FFFF_FFFF_FFFF" ) )
  end

  # With a threshold of 4: keys differing in 4 bits (FFFF vs 7777) are
  # expected to match, keys differing in 5 bits (EFFF_7777) are not.
  def test_match
    m = FuzzyList64.new( 100, 4 )
    assert( m.fuzzy_match( 0, 0 ) )
    assert( m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
                           hex( '7FFF_FFFF_7777_FFFF' ) ) )

    assert( m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
                           hex( 'FFFF_FFFF_7777_FFFF' ) ) )

    assert( ! m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
                             hex( '7FFF_FFFF_EFFF_7777' ) ) )

    assert( ! m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
                             hex( 'FFFF_FFFF_EFFF_7777' ) ) )
  end

  # 0x0 and 0xFF are 8 bits apart, so both are expected to be added;
  # 0xFE and 0x1 are each one bit from an existing key and are rejected.
  def test_add
    m = FuzzyList64.new( 100, 4 )
    assert( m.addIfNotFound( 0x0 ) )
    assert( m.addIfNotFound( 0xFF ) )
    assert( ! m.addIfNotFound( 0xFE ) )
    assert( ! m.addIfNotFound( 0x1 ) )
  end

  def test_series_list
    assert_series( FuzzyList64 )
  end

  def test_series_tree
    assert_series( FuzzyTree64 )
  end

  # Run every TEST_SERIES against fresh instances of the class fclz:
  # at threshold 3 the last key must be rejected, at threshold 2 every
  # key must be accepted.
  def assert_series( fclz )
    TEST_SERIES.each do |s|
      assert_series_last( fclz.new( 5, 3 ), s )
      assert_series_all( fclz.new( 5, 2 ), s )
    end
  end

  # Assert all keys of s but the last are added to fset, and that the
  # last one is then refused. Works on a copy, so s is not modified.
  def assert_series_last( fset, s )
    s = s.dup
    last = s.pop # Remove last for now
    assert_series_all( fset, s )
    assert( ! fset.addIfNotFound( hex( last ) ), last )
  end

  # Assert every key of s is added to fset; the key string is used as
  # the failure message.
  def assert_series_all( fset, s )
    s.each { |k| assert( fset.addIfNotFound( hex( k ) ), k ) }
  end

  def test_find_series_list
    assert_find_series( FuzzyList64 )
  end

  def test_find_series_tree
    assert_find_series( FuzzyTree64 )
  end

  # Same shape as assert_series, but driving addFindAll.
  def assert_find_series( fclz )
    TEST_SERIES.each do |s|
      assert_find_series_last( fclz.new( 5, 3 ), s )
      assert_find_series_all( fclz.new( 5, 2 ), s )
    end
  end

  # Add all keys of s but the last via addFindAll, then add the last and
  # require that exactly one existing match is collected. After removing
  # both that match and the last key, adding the last key again must
  # collect no matches.
  def assert_find_series_last( fset, s )
    s = s.dup
    last = s.pop # Remove last for now
    assert_find_series_all( fset, s )
    l = Java::java.util.ArrayList.new
    assert( ! fset.addFindAll( hex( last ), l ) )
    # Was `assert( l.size(), 1 )`, which only checked that size is truthy
    # (always so for an Integer) and used 1 as the failure message.
    assert_equal( 1, l.size )

    # Remove the match and try again.
    assert( fset.remove( l.get( 0 ) ), "remove match" )
    assert( fset.remove( hex( last ) ), "remove last" )
    l.clear
    assert( ! fset.addFindAll( hex( last ), l ) )
    assert( l.empty? )
  end

  # Add every key of s via addFindAll, each time with a fresh result
  # list, requiring a falsy return and no collected matches.
  def assert_find_series_all( fset, s )
    s.each do |k|
      l = Java::java.util.ArrayList.new
      assert( ! fset.addFindAll( hex( k ), l ) )
      assert( l.empty? )
    end
  end

  # Convert a hex string, optionally grouped with underscores, to a long
  # via BruteFuzzy::unsignedHexToLong.
  def hex( h )
    BruteFuzzy::unsignedHexToLong( h.delete( '_' ) )
  end
end
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+ require 'iudex-simhash'
23
+ require 'iudex-simhash/factory_helper'
24
+
25
# Tests simhash generation over parsed HTML using a filter chain built
# from the Iudex HTML tree filters and the simhash generator.
#
# simhash_stopwords and simhash_generator are not defined in this class;
# presumably they come from the included FactoryHelper (TODO confirm).
class TestSimhashGenerator < MiniTest::Unit::TestCase
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  UniMap.define_accessors

  Order = HTMLTreeFilter::Order

  # The default stopword set is expected to contain 'from'.
  def test_default_stopwords
    stopwords = simhash_stopwords
    assert( stopwords.contains( 'from' ) )
  end

  # The same simhash is expected before and after replacing "the" with a
  # tab and renaming the short "cruft" paragraph.
  def test_generate
    html = <<HTML
<html>
<head>
<title>Title</title>
</head>
<body>
<p>We are talking about the same thing here.</p>
<p>Really this is the same exact thing I was telling you last time.</p>
<p>cruft</p> <!-- Ignored by default 0.3 wordy ratio -->
</body>
</html>
HTML

    map = content( html )
    assert( filter_chain.filter( map ) )
    assert_equal( 'Title', map.title.to_s )
    assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )

    html.gsub!( /the/, "\t" ) # Removing stop words doesn't matter
    html.gsub!( /cruft/, "xcruft" ) # cruft by any other name...
    map = content( html )
    assert( filter_chain.filter( map ) )
    assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )
  end

  # Build a UniMap whose source is the given html string.
  #
  # html::    markup, converted with to_java_bytes
  # charset:: encoding name passed to HTMLUtils::source (default "UTF-8").
  #           Previously this parameter was accepted but ignored in favor
  #           of a hard-coded "UTF-8". It should name the encoding the
  #           bytes of html are actually in.
  def content( html, charset = "UTF-8" )
    map = UniMap.new
    map.source = HTMLUtils::source( html.to_java_bytes, charset )
    map
  end

  # Assemble the FilterChain under test: HTML parse, title extraction and
  # cleanup, a depth-first tree filter pass, then the simhash generator.
  # A new chain is constructed on every call.
  def filter_chain
    filters = []
    filters << HTMLParseFilter.new( :source.to_k, nil, :source_tree.to_k )
    filters << TitleExtractor.new
    filters << TextCtrlWSFilter.new( :title.to_k )

    tfc = TreeFilterChain.new( [ MetaSkipFilter.new,
                                 CharactersNormalizer.new,
                                 WordCounter.new,
                                 WordyCounter.new ] )

    filters << HTMLTreeFilter.new( :source_tree.to_k, tfc, Order::DEPTH_FIRST )

    filters << simhash_generator

    FilterChain.new( "test", filters )
  end

  # Format a long as a hex string via java.lang.Long::toHexString.
  def hex( l )
    Java::java.lang.Long::toHexString( l )
  end

end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-simhash
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-html
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 1.7.1
36
+ - - <
37
+ - !ruby/object:Gem::Version
38
+ version: "2.1"
39
+ type: :development
40
+ version_requirements: *id002
41
+ - !ruby/object:Gem::Dependency
42
+ name: rjack-logback
43
+ prerelease: false
44
+ requirement: &id003 !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ~>
48
+ - !ruby/object:Gem::Version
49
+ version: "1.0"
50
+ type: :development
51
+ version_requirements: *id003
52
+ - !ruby/object:Gem::Dependency
53
+ name: rjack-tarpit
54
+ prerelease: false
55
+ requirement: &id004 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ version: 1.3.0
61
+ type: :development
62
+ version_requirements: *id004
63
+ description: |-
64
+ Iudex is a general purpose web crawler and feed processor in
65
+ ruby/java. The iudex-simhash gem contains support for generation and
66
+ searching over simhash fingerprints
67
+ email:
68
+ - dek-oss@gravitext.com
69
+ executables:
70
+ - iudex-simhash-perftest
71
+ extensions: []
72
+
73
+ extra_rdoc_files:
74
+ - Manifest.txt
75
+ - History.rdoc
76
+ - README.rdoc
77
+ files:
78
+ - History.rdoc
79
+ - Manifest.txt
80
+ - README.rdoc
81
+ - Rakefile
82
+ - pom.xml
83
+ - bin/iudex-simhash-perftest
84
+ - config/stopwords.en
85
+ - lib/iudex-simhash/base.rb
86
+ - lib/iudex-simhash.rb
87
+ - lib/iudex-simhash/factory_helper.rb
88
+ - lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
89
+ - test/setup.rb
90
+ - test/test_fuzzy_set.rb
91
+ - test/test_simhash_generator.rb
92
+ - test/html/gentest.html
93
+ - lib/iudex-simhash/iudex-simhash-1.0.0.jar
94
+ has_rdoc: true
95
+ homepage: http://github.com/dekellum/iudex
96
+ licenses: []
97
+
98
+ post_install_message:
99
+ rdoc_options:
100
+ - --main
101
+ - README.rdoc
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: "0"
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: "0"
116
+ requirements: []
117
+
118
+ rubyforge_project: iudex-simhash
119
+ rubygems_version: 1.5.1
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
123
+ test_files:
124
+ - test/test_fuzzy_set.rb
125
+ - test/test_simhash_generator.rb