iudex-simhash 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +16 -0
- data/README.rdoc +33 -0
- data/Rakefile +39 -0
- data/bin/iudex-simhash-perftest +104 -0
- data/config/stopwords.en +35 -0
- data/lib/iudex-simhash.rb +44 -0
- data/lib/iudex-simhash/base.rb +21 -0
- data/lib/iudex-simhash/factory_helper.rb +65 -0
- data/lib/iudex-simhash/iudex-simhash-1.0.0.jar +0 -0
- data/lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb +80 -0
- data/pom.xml +44 -0
- data/test/html/gentest.html +1447 -0
- data/test/setup.rb +35 -0
- data/test/test_fuzzy_set.rb +144 -0
- data/test/test_simhash_generator.rb +103 -0
- metadata +125 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
pom.xml
|
6
|
+
bin/iudex-simhash-perftest
|
7
|
+
config/stopwords.en
|
8
|
+
lib/iudex-simhash/base.rb
|
9
|
+
lib/iudex-simhash.rb
|
10
|
+
lib/iudex-simhash/factory_helper.rb
|
11
|
+
lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
|
12
|
+
test/setup.rb
|
13
|
+
test/test_fuzzy_set.rb
|
14
|
+
test/test_simhash_generator.rb
|
15
|
+
test/html/gentest.html
|
16
|
+
lib/iudex-simhash/iudex-simhash-1.0.0.jar
|
data/README.rdoc
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
= iudex-simhash
|
2
|
+
|
3
|
+
* http://github.com/dekellum/iudex
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Iudex is a general purpose web crawler and feed processor in
|
8
|
+
ruby/java. The iudex-simhash gem contains support for generation and
|
9
|
+
searching over simhash fingerprints.
|
10
|
+
|
11
|
+
== Dependencies
|
12
|
+
|
13
|
+
* Java 1.5+
|
14
|
+
|
15
|
+
For tests:
|
16
|
+
|
17
|
+
* JRuby 1.3+
|
18
|
+
|
19
|
+
== License
|
20
|
+
|
21
|
+
Copyright (c) 2010-2011 David Kellum
|
22
|
+
|
23
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
24
|
+
may not use this file except in compliance with the License. You
|
25
|
+
may obtain a copy of the License at:
|
26
|
+
|
27
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
28
|
+
|
29
|
+
Unless required by applicable law or agreed to in writing, software
|
30
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
31
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
32
|
+
implied. See the License for the specific language governing
|
33
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
$LOAD_PATH << './lib'
|
4
|
+
require 'iudex-simhash/base'
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
gem 'rjack-tarpit', '~> 1.2'
|
8
|
+
require 'rjack-tarpit'
|
9
|
+
|
10
|
+
# Gem/build definition for iudex-simhash, driven by rjack-tarpit.
t = RJack::TarPit.new( 'iudex-simhash',
                       Iudex::SimHash::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'iudex-html', '~> 1.0.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest',      '>= 1.7.1', '< 2.1' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

file 'Manifest.txt' => [ 'pom.xml' ]

# The version is matched literally: without escaping, the dots in a
# version such as "1.0.0" would match any character and the checks
# below could pass against a different version string.
version_pattern = Regexp.escape( t.version.to_s )

# NOTE(review): test_line_match is provided by rjack-tarpit; presumably
# it verifies that the first line matching the second argument also
# matches the third -- confirm against the rjack-tarpit documentation.
task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{version_pattern}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{version_pattern} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

# Release related tasks first run the consistency checks above.
task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
|
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#.hashdot.vm.options += -Xmx1g
|
4
|
+
# For 64b add: -XX:+UseCompressedOops
|
5
|
+
|
6
|
+
#--
|
7
|
+
# Copyright (c) 2010-2011 David Kellum
|
8
|
+
#
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
10
|
+
# may not use this file except in compliance with the License. You may
|
11
|
+
# obtain a copy of the License at
|
12
|
+
#
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
+
#
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
18
|
+
# implied. See the License for the specific language governing
|
19
|
+
# permissions and limitations under the License.
|
20
|
+
#++
|
21
|
+
|
22
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
23
|
+
|
24
|
+
require 'rubygems'
|
25
|
+
require 'rjack-logback'
|
26
|
+
|
27
|
+
require 'iudex-simhash'
|
28
|
+
require 'iudex-simhash/sim_hash_gen_perf_test_factory'
|
29
|
+
|
30
|
+
require 'optparse'
|
31
|
+
|
32
|
+
require 'gravitext-util'
|
33
|
+
require 'gravitext-util/perftest'
|
34
|
+
|
35
|
+
# Command line driver for the iudex-simhash performance tests. Builds
# either the fuzzy set tests or the simhash generation test and hands
# them to the Gravitext perftest harness.
module PerfTest
  include Iudex::SimHash::BruteFuzzy
  import 'iudex.simhash.brutefuzzy.FuzzySetPerfTest'

  MD = FuzzySetPerfTest::Mode

  # Parse the command line and execute the selected tests.
  #
  # args:: Argument list to process (default: ARGV). It is consumed
  #        destructively: options are removed by the option parser and
  #        positional arguments are shifted off.
  #
  # Prints the usage and exits with status 1 when the first positional
  # argument is neither 'fuzzy' nor 'gen'.
  def self.run( args = ARGV )

    options = {}
    oparser = OptionParser.new do |opts|
      opts.banner = <<USAGE
Usage: iudex-simhash-perftest (fuzzy|gen) [options] <length> <threshold-bits>
USAGE
      opts.on( "-t", "--threads N", Integer, "Test with thread count" ) do |n|
        options[ :threads ] = n
      end
      opts.on( "--tree-bits=N", Integer,
               "Test TREE only, with segments from specified minimum bits" ) do |n|
        options[ :tree_bits ] = n
      end
      opts.on( "--capacity=N", Integer,
               "Set initial capacity to N (default: length)" ) do |n|
        options[ :capacity ] = n
      end

    end

    # Operate on the list that was passed in rather than always on the
    # global ARGV, so that the args parameter is actually honored.
    oparser.parse!( args )

    tests = case( args.shift )
            when 'fuzzy'
              fuzzy_tests( options, args )
            when 'gen'
              gen_test
            else
              puts oparser
              exit 1
            end

    harness = Gravitext::PerfTest::Harness.new( tests )

    harness.thread_count = options[ :threads ] if options[ :threads ]
    harness.execute
  end

  # Build the array of FuzzySetPerfTest instances to run.
  #
  # options:: Hash of parsed options; :tree_bits and :capacity are read.
  # args::    Remaining positional arguments, <length> then
  #           <threshold-bits> (default: ARGV). Both are shifted off.
  #
  # A missing length defaults to 1000 and a missing threshold to 3.
  def self.fuzzy_tests( options, args = ARGV )
    # Apply the default before the integer conversion: nil.to_i is 0,
    # which is truthy, so "args.shift.to_i || 1000" never falls back.
    length    = ( args.shift || 1000 ).to_i
    threshold = ( args.shift || 3 ).to_i

    min_bits = options[ :tree_bits ] || 0

    # 0 selects the LIST mode; other values are TREE segment bit sizes,
    # kept only when 64 / bits exceeds the threshold. A positive
    # :tree_bits excludes 0 and thus the LIST test.
    segbits = [ 0, 4, 8, 16 ].select do |b|
      ( b == 0 || ( ( 64 / b ) > threshold ) ) && ( b >= min_bits )
    end

    segbits.map do |bits|
      mode = ( bits == 0 ) ? MD::LIST : MD::TREE
      pt = FuzzySetPerfTest.new( mode, length, threshold )
      pt.max_bits = bits if bits > 0
      pt.initial_capacity = options[ :capacity ] if options[ :capacity ]
      pt
    end
  end

  # Return a single element array holding the simhash generation test.
  def self.gen_test
    [ SimHashGenPerfTestFactory.new.perf_test ]
  end

end
|
103
|
+
|
104
|
+
PerfTest.run
|
data/config/stopwords.en
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Common 3-4 character english words to be dropped.
|
2
|
+
# Note that 1-2 character words are already dropped by the Tokenizer.
|
3
|
+
all
|
4
|
+
and
|
5
|
+
are
|
6
|
+
been
|
7
|
+
but
|
8
|
+
can
|
9
|
+
did
|
10
|
+
does
|
11
|
+
for
|
12
|
+
from
|
13
|
+
get
|
14
|
+
had
|
15
|
+
have
|
16
|
+
isn
|
17
|
+
its
|
18
|
+
not
|
19
|
+
one
|
20
|
+
our
|
21
|
+
out
|
22
|
+
said
|
23
|
+
say
|
24
|
+
that
|
25
|
+
the
|
26
|
+
they
|
27
|
+
this
|
28
|
+
was
|
29
|
+
were
|
30
|
+
what
|
31
|
+
when
|
32
|
+
will
|
33
|
+
with
|
34
|
+
you
|
35
|
+
your
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-html'
|
18
|
+
|
19
|
+
require 'iudex-simhash/base.rb'
|
20
|
+
|
21
|
+
require 'java'
|
22
|
+
|
23
|
+
# Top level of the iudex-simhash gem: loads the bundled jar and exposes
# its Java classes under Ruby modules that mirror the Java packages.
module Iudex
  module SimHash

    # The jar file name carries the gem version (see VERSION).
    require "iudex-simhash/iudex-simhash-#{VERSION}.jar"

    import 'iudex.simhash.SimHashKeys'

    # Classes of the iudex.simhash.brutefuzzy package.
    module BruteFuzzy
      import 'iudex.simhash.brutefuzzy.BruteFuzzy'
      import 'iudex.simhash.brutefuzzy.FuzzyList64'
      import 'iudex.simhash.brutefuzzy.FuzzyTree64'
    end

    # Classes of the iudex.simhash.filters package.
    module Filters
      import 'iudex.simhash.filters.SimHashGenerator'
    end

    # Classes of the iudex.simhash.gen package.
    module Gen
      import 'iudex.simhash.gen.StopWordSet'
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
  module SimHash
    # Version of the iudex-simhash gem; also used to build the name of
    # the bundled jar and checked against pom.xml by the Rakefile.
    VERSION = '1.0.0'
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-simhash'
|
18
|
+
require 'iudex-filter/key_helper'
|
19
|
+
|
20
|
+
module Iudex
  module SimHash
    module Filters

      # Mixin with helper methods for constructing a SimHashGenerator
      # and the stop word set it is given.
      module FactoryHelper
        include Iudex::Core
        include Iudex::HTML

        # Path of the stop word file shipped with this gem.
        DEFAULT_WORDS = File.join( File.dirname( __FILE__ ), '..', '..',
                                   'config', 'stopwords.en' )

        # Read stop words, one per line, from wfile and return them as
        # a Gen::StopWordSet. Surrounding whitespace is stripped; blank
        # lines and lines starting with '#' are ignored.
        def simhash_stopwords( wfile = DEFAULT_WORDS )
          words = File.open( wfile ) { |fin| fin.readlines }
          words.map! { |w| w.strip }

          # Blank lines would otherwise be passed on as empty-string
          # stop words.
          words.reject! { |w| w.empty? || w =~ /^#/ }

          Gen::StopWordSet.new( words )
        end

        Element = Java::com.gravitext.xml.tree.Element

        # Construct a SimHashGenerator.
        #
        # input::     Name of a method returning the input rows, each
        #             [ key ] or [ key, wordy_ratio ]
        #             (default: :simhash_generator_inputs).
        # stopwords:: Stop word set (default: simhash_stopwords).
        def simhash_generator( input = :simhash_generator_inputs,
                               stopwords = simhash_stopwords )

          inputs = send( input ).map { |r| r.to_a }.map do | key, ratio |
            key = key.to_k
            # Keys holding an Element value are read as a tree, all
            # other keys as text.
            i = if( key.value_type == Element.java_class )
                  SimHashGenerator::Input.forTree( key )
                else
                  SimHashGenerator::Input.forText( key )
                end
            i.wordy_ratio = ratio if ratio
            i
          end

          SimHashGenerator.new( inputs, stopwords )
        end

        # Default input rows: the title, and the source tree with a
        # wordy ratio of 0.30.
        def simhash_generator_inputs
          [ [ :title ],
            [ :source_tree, 0.30 ] ]
        end

      end
    end
  end
end
|
Binary file
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-simhash'
|
18
|
+
require 'iudex-simhash/factory_helper'
|
19
|
+
|
20
|
+
# Builds a SimHashGenPerfTest over the gentest.html sample shipped with
# the gem's tests.
class SimHashGenPerfTestFactory
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  Order = HTMLTreeFilter::Order

  import 'iudex.simhash.filters.SimHashGenPerfTest'

  def initialize
    UniMap.define_accessors
  end

  # Run the sample content once through the filter chain, then return
  # a SimHashGenPerfTest for the resulting map and a default generator.
  def perf_test
    parsed = content
    filter_chain.filter( parsed )

    SimHashGenPerfTest.new( parsed, simhash_generator )
  end

  # Return a new UniMap whose source is set from test/html/gentest.html,
  # passed as bytes with the "UTF-8" encoding name.
  def content
    sample = File.join( File.dirname( __FILE__ ), '..', '..',
                        'test', 'html', 'gentest.html' )
    html = File.read( sample )

    result = UniMap.new
    result.source = HTMLUtils.source( html.to_java_bytes, "UTF-8" )
    result
  end

  # Return the FilterChain named "perf_test": HTML parse, title
  # extraction, title text filter, then the tree filter chain applied
  # depth first to the source tree.
  def filter_chain
    parse = HTMLParseFilter.new( ContentKeys::SOURCE,
                                 nil, HTMLKeys::SOURCE_TREE )
    title = TitleExtractor.new
    title_ws = TextCtrlWSFilter.new( ContentKeys::TITLE )

    tree_filters = [ MetaSkipFilter.new,
                     CharactersNormalizer.new,
                     WordCounter.new,
                     WordyCounter.new ]
    tree = HTMLTreeFilter.new( HTMLKeys::SOURCE_TREE,
                               TreeFilterChain.new( tree_filters ),
                               Order::DEPTH_FIRST )

    FilterChain.new( "perf_test", [ parse, title, title_ws, tree ] )
  end

end
|