iudex-simhash 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
+ ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
20
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
21
+
22
+ require 'rubygems'
23
+
24
+ require 'rjack-logback'
25
+ RJack::Logback.config_console( :stderr => true )
26
+
27
+ require 'minitest/unit'
28
+ require 'minitest/autorun'
29
+
30
+ # Make test output logging compatible: no partial lines.
31
+ # class TestOut
32
+ # def print( *a ); $stdout.puts( *a ); end
33
+ # def puts( *a ); $stdout.puts( *a ); end
34
+ # end
35
+ # MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-simhash'
23
+
24
+ class TestFuzzySet < MiniTest::Unit::TestCase
25
+ include Iudex::SimHash::BruteFuzzy
26
+
27
+ # Series that will allow all but last at 3 bit threshold, all at 2
28
+ # bit threshold.
29
+ TEST_SERIES = [ %w[ FFFF_FFFF_FFFF_FFFF
30
+ 7FFF_7FFF_7FFF_7FFF
31
+ F7FF_F7FF_F7FF_F7FF
32
+ FF7F_FF7F_FF7F_FFFF ],
33
+
34
+ %w[ 0000_0000_0000_0000
35
+ 0100_1000_1000_0010
36
+ 1000_0100_0100_1000
37
+ 0010_0010_0010_0001
38
+ 0001_0001_0001_0000 ],
39
+
40
+ %w[ 0000_0000_0000_0000
41
+ 0010_0100_0100_0100
42
+ 0001_0000_1000_1001
43
+ 0100_1001_0001_0000
44
+ 0000_0010_0010_0010 ] ]
45
+
46
+ def test_hex
47
+ assert_equal( 0x1000_0000, hex( "1000_0000" ) )
48
+ assert_equal( 0x7FFF_FFFF_FFFF_FFFF, hex( "7FFF_FFFF_FFFF_FFFF" ) )
49
+ assert_equal( -1, hex( "FFFF_FFFF_FFFF_FFFF" ) )
50
+ end
51
+
52
+ def test_match
53
+ m = FuzzyList64.new( 100, 4 )
54
+ assert( m.fuzzy_match( 0, 0 ) )
55
+ assert( m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
56
+ hex( '7FFF_FFFF_7777_FFFF' ) ) )
57
+
58
+ assert( m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
59
+ hex( 'FFFF_FFFF_7777_FFFF' ) ) )
60
+
61
+ assert( ! m.fuzzy_match( hex( '7FFF_FFFF_FFFF_FFFF' ),
62
+ hex( '7FFF_FFFF_EFFF_7777' ) ) )
63
+
64
+ assert( ! m.fuzzy_match( hex( 'FFFF_FFFF_FFFF_FFFF' ),
65
+ hex( 'FFFF_FFFF_EFFF_7777' ) ) )
66
+ end
67
+
68
+ def test_add
69
+ m = FuzzyList64.new( 100, 4 )
70
+ assert( m.addIfNotFound( 0x0 ) )
71
+ assert( m.addIfNotFound( 0xFF ) )
72
+ assert( ! m.addIfNotFound( 0xFE ) )
73
+ assert( ! m.addIfNotFound( 0x1 ) )
74
+ end
75
+
76
+ def test_series_list
77
+ assert_series( FuzzyList64 )
78
+ end
79
+
80
+ def test_series_tree
81
+ assert_series( FuzzyTree64 )
82
+ end
83
+
84
+ def assert_series( fclz ) # Runs every TEST_SERIES entry against fresh fclz instances, one per threshold
85
+ TEST_SERIES.each do |s|
86
+ assert_series_last( fclz.new( 5, 3 ), s ) # 3 bit threshold (per the TEST_SERIES comment): last key should be refused
87
+ assert_series_all( fclz.new( 5, 2 ), s ) # 2 bit threshold: every key should be added
88
+ end
89
+ end
90
+
91
+ def assert_series_last( fset, s ) # All keys of s except the last must be added to fset; the last must be refused
92
+ s = s.dup # work on a copy so the pop below does not modify the caller's array
93
+ last = s.pop # Remove last for now
94
+ assert_series_all( fset, s )
95
+ assert( ! fset.addIfNotFound( hex( last ) ), last ) # the key itself is used as the failure message
96
+ end
97
+
98
+ def assert_series_all( fset, s ) # Asserts fset.addIfNotFound is truthy for each hex key of s, in order
99
+ s.each { |k| assert( fset.addIfNotFound( hex( k ) ), k ) } # the key itself is used as the failure message
100
+ end
101
+
102
+ def test_find_series_list
103
+ assert_find_series( FuzzyList64 )
104
+ end
105
+
106
+ def test_find_series_tree
107
+ assert_find_series( FuzzyTree64 )
108
+ end
109
+
110
+ def assert_find_series( fclz ) # Runs every TEST_SERIES entry through the addFindAll checks with fresh fclz instances
111
+ TEST_SERIES.each do |s|
112
+ assert_find_series_last( fclz.new( 5, 3 ), s ) # 3 bit threshold (per the TEST_SERIES comment): last key should find a match
113
+ assert_find_series_all( fclz.new( 5, 2 ), s ) # 2 bit threshold: no key should find a match
114
+ end
115
+ end
116
+
117
+ def assert_find_series_last( fset, s )
118
+ s = s.dup
119
+ last = s.pop # Remove last for now
120
+ assert_find_series_all( fset, s )
121
+ l = Java::java.util.ArrayList.new;
122
+ assert( ! fset.addFindAll( hex( last ), l ) )
123
+ assert( l.size(), 1 );
124
+
125
+ # Remove the match and try again.
126
+ assert( fset.remove( l.get( 0 ) ), "remove match" )
127
+ assert( fset.remove( hex( last ) ), "remove last" )
128
+ l.clear
129
+ assert( ! fset.addFindAll( hex( last ), l ) )
130
+ assert( l.empty? )
131
+ end
132
+
133
+ def assert_find_series_all( fset, s ) # For each hex key of s: addFindAll must be falsy and leave the list empty
134
+ s.each do |k|
135
+ l = Java::java.util.ArrayList.new; # fresh list per key, handed to addFindAll (presumably to collect matches)
136
+ assert( ! fset.addFindAll( hex( k ), l ) )
137
+ assert( l.empty? )
138
+ end
139
+ end
140
+
141
+ def hex( h ) # Converts an underscore-grouped hex string to a long via BruteFuzzy::unsignedHexToLong
142
+ BruteFuzzy::unsignedHexToLong( h.gsub( /_/, '' ) ) # underscores are stripped before conversion
143
+ end
144
+ end
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+ require 'iudex-simhash'
23
+ require 'iudex-simhash/factory_helper'
24
+
25
+ class TestSimhashGenerator < MiniTest::Unit::TestCase
26
+ include Gravitext::HTMap
27
+ include Iudex::Core
28
+ include Iudex::Core::Filters
29
+ include Iudex::HTML
30
+ include Iudex::HTML::Filters
31
+ include Iudex::HTML::Tree
32
+ include Iudex::HTML::Tree::Filters
33
+ include Iudex::Filter::Core
34
+ include Iudex::SimHash::Filters
35
+
36
+ include Iudex::SimHash::Filters::FactoryHelper
37
+
38
+ import 'iudex.html.HTMLUtils'
39
+
40
+ UniMap.define_accessors
41
+
42
+ Order = HTMLTreeFilter::Order
43
+
44
+ def test_default_stopwords
45
+ stopwords = simhash_stopwords
46
+ assert( stopwords.contains( 'from' ) )
47
+ end
48
+
49
+ def test_generate
50
+ html = <<HTML
51
+ <html>
52
+ <head>
53
+ <title>Title</title>
54
+ </head>
55
+ <body>
56
+ <p>We are talking about the same thing here.</p>
57
+ <p>Really this is the same exact thing I was telling you last time.</p>
58
+ <p>cruft</p> <!-- Ignored by default 0.3 wordy ratio -->
59
+ </body>
60
+ </html>
61
+ HTML
62
+
63
+ map = content( html )
64
+ assert( filter_chain.filter( map ) )
65
+ assert_equal( 'Title', map.title.to_s )
66
+ assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )
67
+
68
+ html.gsub!( /the/, "\t" ) # Removing stop words doesn't matter
69
+ html.gsub!( /cruft/, "xcruft" ) # cruft by any other name...
70
+ map = content( html )
71
+ assert( filter_chain.filter( map ) )
72
+ assert_equal( 'eaa4172924c0bf6e', hex( map.simhash ) )
73
+ end
74
+
75
+ def content( html, charset = "UTF-8" )
76
+ map = UniMap.new
77
+ map.source = HTMLUtils::source( html.to_java_bytes, "UTF-8" )
78
+ map
79
+ end
80
+
81
+ def filter_chain
82
+ filters = []
83
+ filters << HTMLParseFilter.new( :source.to_k, nil, :source_tree.to_k )
84
+ filters << TitleExtractor.new
85
+ filters << TextCtrlWSFilter.new( :title.to_k )
86
+
87
+ tfc = TreeFilterChain.new( [ MetaSkipFilter.new,
88
+ CharactersNormalizer.new,
89
+ WordCounter.new,
90
+ WordyCounter.new ] )
91
+
92
+ filters << HTMLTreeFilter.new( :source_tree.to_k, tfc, Order::DEPTH_FIRST )
93
+
94
+ filters << simhash_generator
95
+
96
+ FilterChain.new( "test", filters )
97
+ end
98
+
99
+ def hex( l ) # Formats l as a hex string via java.lang.Long.toHexString
100
+ Java::java.lang.Long::toHexString( l )
101
+ end
102
+
103
+ end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-simhash
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-html
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 1.7.1
36
+ - - <
37
+ - !ruby/object:Gem::Version
38
+ version: "2.1"
39
+ type: :development
40
+ version_requirements: *id002
41
+ - !ruby/object:Gem::Dependency
42
+ name: rjack-logback
43
+ prerelease: false
44
+ requirement: &id003 !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ~>
48
+ - !ruby/object:Gem::Version
49
+ version: "1.0"
50
+ type: :development
51
+ version_requirements: *id003
52
+ - !ruby/object:Gem::Dependency
53
+ name: rjack-tarpit
54
+ prerelease: false
55
+ requirement: &id004 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ version: 1.3.0
61
+ type: :development
62
+ version_requirements: *id004
63
+ description: |-
64
+ Iudex is a general purpose web crawler and feed processor in
65
+ ruby/java. The iudex-simhash gem contains support for generation and
66
+ searching over simhash fingerprints
67
+ email:
68
+ - dek-oss@gravitext.com
69
+ executables:
70
+ - iudex-simhash-perftest
71
+ extensions: []
72
+
73
+ extra_rdoc_files:
74
+ - Manifest.txt
75
+ - History.rdoc
76
+ - README.rdoc
77
+ files:
78
+ - History.rdoc
79
+ - Manifest.txt
80
+ - README.rdoc
81
+ - Rakefile
82
+ - pom.xml
83
+ - bin/iudex-simhash-perftest
84
+ - config/stopwords.en
85
+ - lib/iudex-simhash/base.rb
86
+ - lib/iudex-simhash.rb
87
+ - lib/iudex-simhash/factory_helper.rb
88
+ - lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
89
+ - test/setup.rb
90
+ - test/test_fuzzy_set.rb
91
+ - test/test_simhash_generator.rb
92
+ - test/html/gentest.html
93
+ - lib/iudex-simhash/iudex-simhash-1.0.0.jar
94
+ has_rdoc: true
95
+ homepage: http://github.com/dekellum/iudex
96
+ licenses: []
97
+
98
+ post_install_message:
99
+ rdoc_options:
100
+ - --main
101
+ - README.rdoc
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: "0"
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: "0"
116
+ requirements: []
117
+
118
+ rubyforge_project: iudex-simhash
119
+ rubygems_version: 1.5.1
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
123
+ test_files:
124
+ - test/test_fuzzy_set.rb
125
+ - test/test_simhash_generator.rb