iudex-simhash 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +16 -0
- data/README.rdoc +33 -0
- data/Rakefile +39 -0
- data/bin/iudex-simhash-perftest +104 -0
- data/config/stopwords.en +35 -0
- data/lib/iudex-simhash.rb +44 -0
- data/lib/iudex-simhash/base.rb +21 -0
- data/lib/iudex-simhash/factory_helper.rb +65 -0
- data/lib/iudex-simhash/iudex-simhash-1.0.0.jar +0 -0
- data/lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb +80 -0
- data/pom.xml +44 -0
- data/test/html/gentest.html +1447 -0
- data/test/setup.rb +35 -0
- data/test/test_fuzzy_set.rb +144 -0
- data/test/test_simhash_generator.rb +103 -0
- metadata +125 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
pom.xml
|
6
|
+
bin/iudex-simhash-perftest
|
7
|
+
config/stopwords.en
|
8
|
+
lib/iudex-simhash/base.rb
|
9
|
+
lib/iudex-simhash.rb
|
10
|
+
lib/iudex-simhash/factory_helper.rb
|
11
|
+
lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
|
12
|
+
test/setup.rb
|
13
|
+
test/test_fuzzy_set.rb
|
14
|
+
test/test_simhash_generator.rb
|
15
|
+
test/html/gentest.html
|
16
|
+
lib/iudex-simhash/iudex-simhash-1.0.0.jar
|
data/README.rdoc
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
= iudex-simhash
|
2
|
+
|
3
|
+
* http://github.com/dekellum/iudex
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Iudex is a general purpose web crawler and feed processor in
|
8
|
+
ruby/java. The iudex-simhash gem contains support for generation and
|
9
|
+
searching over simhash fingerprints.
|
10
|
+
|
11
|
+
== Dependencies
|
12
|
+
|
13
|
+
* Java 1.5+
|
14
|
+
|
15
|
+
For tests:
|
16
|
+
|
17
|
+
* JRuby 1.3+
|
18
|
+
|
19
|
+
== License
|
20
|
+
|
21
|
+
Copyright (c) 2010-2011 David Kellum
|
22
|
+
|
23
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
24
|
+
may not use this file except in compliance with the License. You
|
25
|
+
may obtain a copy of the License at:
|
26
|
+
|
27
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
28
|
+
|
29
|
+
Unless required by applicable law or agreed to in writing, software
|
30
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
31
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
32
|
+
implied. See the License for the specific language governing
|
33
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- ruby -*-

# Put the in-tree lib directory on the load path so the version
# constant can be read without the gem being installed.
$LOAD_PATH << './lib'
require 'iudex-simhash/base'

require 'rubygems'
gem 'rjack-tarpit', '~> 1.2'
require 'rjack-tarpit'

# TarPit instance from which the gem specification and the standard
# tasks are generated (see t.define_tasks at the end of this file).
t = RJack::TarPit.new( 'iudex-simhash',
                       Iudex::SimHash::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'iudex-html', '~> 1.0.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

# Manifest.txt has pom.xml as a prerequisite.
file 'Manifest.txt' => [ 'pom.xml' ]

# Release sanity checks.
# NOTE(review): test_line_match is provided by TarPit; presumably it
# fails when the first line matching the first pattern does not also
# match the second pattern -- confirm against rjack-tarpit.
task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{t.version}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

# Attach the checks as prerequisites of the gem, tag and push tasks.
task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
|
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#.hashdot.vm.options += -Xmx1g
|
4
|
+
# For 64b add: -XX:+UseCompressedOops
|
5
|
+
|
6
|
+
#--
|
7
|
+
# Copyright (c) 2010-2011 David Kellum
|
8
|
+
#
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
10
|
+
# may not use this file except in compliance with the License. You may
|
11
|
+
# obtain a copy of the License at
|
12
|
+
#
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
+
#
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
18
|
+
# implied. See the License for the specific language governing
|
19
|
+
# permissions and limitations under the License.
|
20
|
+
#++
|
21
|
+
|
22
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
23
|
+
|
24
|
+
require 'rubygems'
|
25
|
+
require 'rjack-logback'
|
26
|
+
|
27
|
+
require 'iudex-simhash'
|
28
|
+
require 'iudex-simhash/sim_hash_gen_perf_test_factory'
|
29
|
+
|
30
|
+
require 'optparse'
|
31
|
+
|
32
|
+
require 'gravitext-util'
|
33
|
+
require 'gravitext-util/perftest'
|
34
|
+
|
35
|
+
# Command line driver for the iudex-simhash performance tests.
#
# Two test sets are available, selected by the first non-option
# argument: 'fuzzy' (FuzzySetPerfTest in LIST and TREE modes) and 'gen'
# (the SimHashGenPerfTestFactory generated test).
module PerfTest
  include Iudex::SimHash::BruteFuzzy
  import 'iudex.simhash.brutefuzzy.FuzzySetPerfTest'

  MD = FuzzySetPerfTest::Mode

  # Parse options and arguments, build the selected tests and execute
  # them with a Gravitext::PerfTest::Harness.
  #
  # args:: argument array to parse (default: ARGV). Parsed options and
  #        consumed positional arguments are removed from this array.
  #
  # Prints the usage text and exits with status 1 when the first
  # positional argument is neither 'fuzzy' nor 'gen'.
  def self.run( args = ARGV )

    options = {}
    oparser = OptionParser.new do |opts|
      opts.banner = <<USAGE
Usage: iudex-simhash-perftest (fuzzy|gen) [options] <length> <threshold-bits>
USAGE
      opts.on( "-t", "--threads N", Integer, "Test with thread count" ) do |n|
        options[ :threads ] = n
      end
      opts.on( "--tree-bits=N", Integer,
               "Test TREE only, with segments from specified minimum bits" ) do |n|
        options[ :tree_bits ] = n
      end
      opts.on( "--capacity=N", Integer,
               "Set initial capacity to N (default: length)" ) do |n|
        options[ :capacity ] = n
      end
    end

    # Parse the array actually passed in; a bare parse! would always
    # read ARGV and silently ignore the args parameter.
    oparser.parse!( args )

    tests = case( args.shift )
            when 'fuzzy'
              fuzzy_tests( options, args )
            when 'gen'
              gen_test
            else
              puts oparser
              exit 1
            end

    harness = Gravitext::PerfTest::Harness.new( tests )

    harness.thread_count = options[ :threads ] if options[ :threads ]
    harness.execute
  end

  # Build one FuzzySetPerfTest per applicable segment bit size.
  #
  # options:: Hash as filled by run; :tree_bits (minimum segment bits)
  #           and :capacity (initial capacity) are honored.
  # args::    remaining positional arguments, <length> then
  #           <threshold-bits> (default: ARGV). Both are shifted off.
  #
  # Returns an Array of FuzzySetPerfTest. Bit size 0 selects LIST mode,
  # any other size TREE mode with max_bits set to that size. A non-zero
  # size is only used when 64 / size exceeds the threshold, and any size
  # is only used when it is at least options[ :tree_bits ].
  def self.fuzzy_tests( options, args = ARGV )
    # The default must be applied before the conversion: nil.to_i is 0
    # and 0 is truthy in Ruby, so `x.to_i || default` never yields the
    # default.
    length    = ( args.shift || 1000 ).to_i
    threshold = ( args.shift || 3 ).to_i

    min_bits = options[ :tree_bits ] || 0
    segbits = [ 0, 4, 8, 16 ].select do |b|
      ( b == 0 || ( ( 64 / b ) > threshold ) ) && ( b >= min_bits )
    end

    segbits.map do |bits|
      mode = ( bits == 0 ) ? MD::LIST : MD::TREE
      pt = FuzzySetPerfTest.new( mode, length, threshold )
      pt.max_bits = bits if bits > 0
      pt.initial_capacity = options[ :capacity ] if options[ :capacity ]
      pt
    end
  end

  # Returns a single element Array holding the perf test produced by
  # SimHashGenPerfTestFactory.
  def self.gen_test
    [ SimHashGenPerfTestFactory.new.perf_test ]
  end

end
|
103
|
+
|
104
|
+
PerfTest.run
|
data/config/stopwords.en
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Common 3-4 character English words to be dropped.
|
2
|
+
# Note that 1-2 character words are already dropped by the Tokenizer.
|
3
|
+
all
|
4
|
+
and
|
5
|
+
are
|
6
|
+
been
|
7
|
+
but
|
8
|
+
can
|
9
|
+
did
|
10
|
+
does
|
11
|
+
for
|
12
|
+
from
|
13
|
+
get
|
14
|
+
had
|
15
|
+
have
|
16
|
+
isn
|
17
|
+
its
|
18
|
+
not
|
19
|
+
one
|
20
|
+
our
|
21
|
+
out
|
22
|
+
said
|
23
|
+
say
|
24
|
+
that
|
25
|
+
the
|
26
|
+
they
|
27
|
+
this
|
28
|
+
was
|
29
|
+
were
|
30
|
+
what
|
31
|
+
when
|
32
|
+
will
|
33
|
+
with
|
34
|
+
you
|
35
|
+
your
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-html'
|
18
|
+
|
19
|
+
require 'iudex-simhash/base.rb'
|
20
|
+
|
21
|
+
require 'java'
|
22
|
+
|
23
|
+
module Iudex
  # Ruby-side namespace for the iudex-simhash java classes. Loading
  # this file requires the bundled jar and imports the java classes
  # into the nested modules below.
  module SimHash

    # The jar file name embeds the gem VERSION (defined in
    # iudex-simhash/base.rb).
    require "iudex-simhash/iudex-simhash-#{VERSION}.jar"

    import 'iudex.simhash.SimHashKeys'

    # Imports from java package iudex.simhash.brutefuzzy
    module BruteFuzzy
      import 'iudex.simhash.brutefuzzy.BruteFuzzy'
      import 'iudex.simhash.brutefuzzy.FuzzyList64'
      import 'iudex.simhash.brutefuzzy.FuzzyTree64'
    end

    # Imports from java package iudex.simhash.filters
    module Filters
      import 'iudex.simhash.filters.SimHashGenerator'
    end

    # Imports from java package iudex.simhash.gen
    module Gen
      import 'iudex.simhash.gen.StopWordSet'
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
  module SimHash
    # Gem version. Also interpolated into the bundled jar file name by
    # lib/iudex-simhash.rb and passed to TarPit in the Rakefile.
    VERSION = '1.0.0'
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-simhash'
|
18
|
+
require 'iudex-filter/key_helper'
|
19
|
+
|
20
|
+
module Iudex
  module SimHash
    module Filters

      # Mix-in with factory methods for building a SimHashGenerator and
      # the StopWordSet it uses.
      module FactoryHelper
        include Iudex::Core
        include Iudex::HTML

        # Path of the stop word file shipped in this gem's config
        # directory.
        DEFAULT_WORDS = File.join( File.dirname( __FILE__ ), '..', '..',
                                   'config', 'stopwords.en' )

        # Read stop words from wfile, one per line, and return them as
        # a Gen::StopWordSet.
        #
        # Each line is stripped of surrounding whitespace. Lines
        # starting with '#' are comments and are dropped, as are blank
        # lines (which would otherwise be passed on as empty-string
        # stop words).
        #
        # wfile:: path of the word file (default: DEFAULT_WORDS)
        def simhash_stopwords( wfile = DEFAULT_WORDS )
          words = File.readlines( wfile ).map { |w| w.strip }
          words.reject! { |w| w.empty? || w =~ /^#/ }

          Gen::StopWordSet.new( words )
        end

        Element = Java::com.gravitext.xml.tree.Element

        # Create a SimHashGenerator.
        #
        # input::     name of a method (invoked via send) returning rows
        #             of [ key, ratio ], where ratio is optional.
        # stopwords:: stop word set handed to the generator
        #             (default: simhash_stopwords).
        #
        # Each key is converted with to_k. Keys whose value_type is the
        # Element java class become tree inputs, all others text inputs.
        # When a ratio is given it is assigned to the input's
        # wordy_ratio.
        def simhash_generator( input = :simhash_generator_inputs,
                               stopwords = simhash_stopwords )

          inputs = send( input ).map do |row|
            name, ratio = row.to_a
            key = name.to_k
            gen_input = if( key.value_type == Element.java_class )
                          SimHashGenerator::Input.forTree( key )
                        else
                          SimHashGenerator::Input.forText( key )
                        end
            gen_input.wordy_ratio = ratio if ratio
            gen_input
          end

          SimHashGenerator.new( inputs, stopwords )
        end

        # Default generator inputs as rows of [ key, ratio ]: the title
        # with no ratio, and the source tree with a ratio of 0.30.
        def simhash_generator_inputs
          [ [ :title ],
            [ :source_tree, 0.30 ] ]
        end

      end
    end
  end
end
|
Binary file
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-simhash'
|
18
|
+
require 'iudex-simhash/factory_helper'
|
19
|
+
|
20
|
+
# Factory for a SimHashGenPerfTest over the gentest.html test document.
class SimHashGenPerfTestFactory
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  Order = HTMLTreeFilter::Order

  import 'iudex.simhash.filters.SimHashGenPerfTest'

  def initialize
    UniMap.define_accessors
  end

  # Returns a SimHashGenPerfTest constructed from the test content,
  # after it has been run once through filter_chain, and a generator
  # from FactoryHelper#simhash_generator.
  def perf_test

    # Initial parse
    map = content
    filter_chain.filter( map )

    SimHashGenPerfTest.new( map, simhash_generator )
  end

  # Returns a new UniMap whose source is set from the bytes of
  # test/html/gentest.html (located relative to this file), passed to
  # HTMLUtils::source with "UTF-8".
  def content
    map = UniMap.new

    html = File.read( File.join( File.dirname( __FILE__ ), '..', '..',
                                 'test', 'html', 'gentest.html' ) )

    map.source = HTMLUtils::source( html.to_java_bytes, "UTF-8" )
    map
  end

  # Returns the FilterChain named "perf_test": HTML parse of the source
  # to a source tree, title extraction, TextCtrlWSFilter on the title,
  # then a depth first HTMLTreeFilter applying the tree filters
  # (MetaSkipFilter, CharactersNormalizer, WordCounter, WordyCounter).
  def filter_chain
    filters = []
    filters << HTMLParseFilter.new( ContentKeys::SOURCE,
                                    nil, HTMLKeys::SOURCE_TREE )
    filters << TitleExtractor.new
    filters << TextCtrlWSFilter.new( ContentKeys::TITLE )

    tfc = TreeFilterChain.new( [ MetaSkipFilter.new,
                                 CharactersNormalizer.new,
                                 WordCounter.new,
                                 WordyCounter.new ] )

    filters << HTMLTreeFilter.new( HTMLKeys::SOURCE_TREE,
                                   tfc, Order::DEPTH_FIRST )

    FilterChain.new( "perf_test", filters )
  end

end
|