iudex-simhash 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
@@ -0,0 +1,16 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ bin/iudex-simhash-perftest
7
+ config/stopwords.en
8
+ lib/iudex-simhash/base.rb
9
+ lib/iudex-simhash.rb
10
+ lib/iudex-simhash/factory_helper.rb
11
+ lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
12
+ test/setup.rb
13
+ test/test_fuzzy_set.rb
14
+ test/test_simhash_generator.rb
15
+ test/html/gentest.html
16
+ lib/iudex-simhash/iudex-simhash-1.0.0.jar
@@ -0,0 +1,33 @@
1
+ = iudex-simhash
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general-purpose web crawler and feed processor in
8
+ Ruby/Java. The iudex-simhash gem contains support for generating and
9
+ searching over simhash fingerprints.
10
+
11
+ == Dependencies
12
+
13
+ * Java 1.5+
14
+
15
+ For tests:
16
+
17
+ * JRuby 1.3+
18
+
19
+ == License
20
+
21
+ Copyright (c) 2010-2011 David Kellum
22
+
23
+ Licensed under the Apache License, Version 2.0 (the "License"); you
24
+ may not use this file except in compliance with the License. You
25
+ may obtain a copy of the License at:
26
+
27
+ http://www.apache.org/licenses/LICENSE-2.0
28
+
29
+ Unless required by applicable law or agreed to in writing, software
30
+ distributed under the License is distributed on an "AS IS" BASIS,
31
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
32
+ implied. See the License for the specific language governing
33
+ permissions and limitations under the License.
@@ -0,0 +1,39 @@
1
# -*- ruby -*-

# Put the in-tree lib directory on the load path so the gem version can
# be read from the source tree.
$LOAD_PATH << './lib'
require 'iudex-simhash/base'

require 'rubygems'
gem     'rjack-tarpit', '~> 1.2'
require 'rjack-tarpit'

t = RJack::TarPit.new( 'iudex-simhash',
                       Iudex::SimHash::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'iudex-html', '~> 1.0.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest',      '>= 1.7.1', '< 2.1' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

file 'Manifest.txt' => [ 'pom.xml' ]

# Escape the version before embedding it in a pattern: unescaped, each
# '.' of "1.0.0" would match any character, making the checks below
# looser than intended.
version_pattern = Regexp.escape( t.version.to_s )

# Release sanity checks. NOTE(review): test_line_match presumably finds
# the first line matching the first pattern and requires it to also
# match the second -- confirm against rjack-tarpit.
task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{version_pattern}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{version_pattern} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

# Wire the checks in as prerequisites of the packaging/release tasks.
task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #.hashdot.vm.options += -Xmx1g
4
+ # For 64b add: -XX:+UseCompressedOops
5
+
6
+ #--
7
+ # Copyright (c) 2010-2011 David Kellum
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
10
+ # may not use this file except in compliance with the License. You may
11
+ # obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
18
+ # implied. See the License for the specific language governing
19
+ # permissions and limitations under the License.
20
+ #++
21
+
22
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
23
+
24
+ require 'rubygems'
25
+ require 'rjack-logback'
26
+
27
+ require 'iudex-simhash'
28
+ require 'iudex-simhash/sim_hash_gen_perf_test_factory'
29
+
30
+ require 'optparse'
31
+
32
+ require 'gravitext-util'
33
+ require 'gravitext-util/perftest'
34
+
35
# Command line driver for the iudex-simhash performance tests. Builds
# either the fuzzy set tests or the simhash generator test and executes
# them with Gravitext::PerfTest::Harness.
module PerfTest
  include Iudex::SimHash::BruteFuzzy
  import 'iudex.simhash.brutefuzzy.FuzzySetPerfTest'

  MD = FuzzySetPerfTest::Mode

  # Values used when the corresponding positional argument is omitted.
  DEFAULT_LENGTH    = 1000
  DEFAULT_THRESHOLD = 3

  # Parse options and the (fuzzy|gen) sub-command from args, then run
  # the selected tests. Prints usage and exits with status 1 when the
  # sub-command is missing or not recognized.
  #
  # args:: Argument array, consumed destructively (default: ARGV).
  def self.run( args = ARGV )
    options = {}
    oparser = OptionParser.new do |opts|
      opts.banner = <<USAGE
Usage: iudex-simhash-perftest (fuzzy|gen) [options] <length> <threshold-bits>
USAGE
      opts.on( "-t", "--threads N", Integer, "Test with thread count" ) do |n|
        options[ :threads ] = n
      end
      opts.on( "--tree-bits=N", Integer,
               "Test TREE only, with segments from specified minimum bits" ) do |n|
        options[ :tree_bits ] = n
      end
      opts.on( "--capacity=N", Integer,
               "Set initial capacity to N (default: length)" ) do |n|
        options[ :capacity ] = n
      end
    end

    # Parse the array actually passed in (previously the args parameter
    # was ignored and the global ARGV was always used). Recognized
    # options are removed, leaving the positional arguments.
    oparser.parse!( args )

    tests = case args.shift
            when 'fuzzy'
              fuzzy_tests( options, args )
            when 'gen'
              gen_test
            else
              puts oparser
              exit 1
            end

    harness = Gravitext::PerfTest::Harness.new( tests )

    harness.thread_count = options[ :threads ] if options[ :threads ]
    harness.execute
  end

  # Build one FuzzySetPerfTest per applicable segment size.
  #
  # options:: Hash of parsed options; :tree_bits and :capacity are read.
  # args::    Remaining positional arguments, <length> then
  #           <threshold-bits>; each falls back to its default when
  #           absent (default: ARGV).
  #
  # A segment size of 0 selects LIST mode, any other size TREE mode.
  # TREE sizes are kept only when 64 / size exceeds the threshold, and
  # every size must be at least :tree_bits, so a positive :tree_bits
  # drops the LIST test.
  def self.fuzzy_tests( options, args = ARGV )
    length    = int_arg( args, DEFAULT_LENGTH )
    threshold = int_arg( args, DEFAULT_THRESHOLD )

    min_bits = options[ :tree_bits ] || 0
    segbits = [ 0, 4, 8, 16 ].select do |b|
      ( b == 0 || ( ( 64 / b ) > threshold ) ) && ( b >= min_bits )
    end

    segbits.map do |bits|
      mode = ( bits == 0 ) ? MD::LIST : MD::TREE
      pt = FuzzySetPerfTest.new( mode, length, threshold )
      pt.max_bits = bits if bits > 0
      pt.initial_capacity = options[ :capacity ] if options[ :capacity ]
      pt
    end
  end

  # Single element array holding the simhash generator perf test.
  def self.gen_test
    [ SimHashGenPerfTestFactory.new.perf_test ]
  end

  # Shift the next positional argument off args and convert it with
  # to_i, or return default when no argument remains. The absence test
  # must come before the conversion: nil.to_i is 0, which is truthy, so
  # the former "ARGV.shift.to_i || default" could never yield the
  # default.
  def self.int_arg( args, default )
    value = args.shift
    value.nil? ? default : value.to_i
  end

end

PerfTest.run
@@ -0,0 +1,35 @@
1
+ # Common 3-4 character English words to be dropped.
2
+ # Note that 1-2 character words are already dropped by the Tokenizer.
3
+ all
4
+ and
5
+ are
6
+ been
7
+ but
8
+ can
9
+ did
10
+ does
11
+ for
12
+ from
13
+ get
14
+ had
15
+ have
16
+ isn
17
+ its
18
+ not
19
+ one
20
+ our
21
+ out
22
+ said
23
+ say
24
+ that
25
+ the
26
+ they
27
+ this
28
+ was
29
+ were
30
+ what
31
+ when
32
+ will
33
+ with
34
+ you
35
+ your
@@ -0,0 +1,44 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-html'
18
+
19
+ require 'iudex-simhash/base.rb'
20
+
21
+ require 'java'
22
+
23
module Iudex
  # Ruby namespace exposing the Java classes of the iudex-simhash jar.
  module SimHash

    # Load the jar shipped beside this file; its name embeds VERSION.
    require "iudex-simhash/iudex-simhash-#{VERSION}.jar"

    import 'iudex.simhash.SimHashKeys'

    # Classes of the iudex.simhash.brutefuzzy Java package.
    module BruteFuzzy
      %w[ BruteFuzzy FuzzyList64 FuzzyTree64 ].each do |cls|
        import "iudex.simhash.brutefuzzy.#{cls}"
      end
    end

    # Classes of the iudex.simhash.filters Java package.
    module Filters
      import 'iudex.simhash.filters.SimHashGenerator'
    end

    # Classes of the iudex.simhash.gen Java package.
    module Gen
      import 'iudex.simhash.gen.StopWordSet'
    end
  end
end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module Iudex
  module SimHash
    # Release version of the iudex-simhash gem. Frozen so that the shared
    # constant cannot be modified in place by code that receives it.
    VERSION = '1.0.0'.freeze
  end
end
@@ -0,0 +1,65 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-simhash'
18
+ require 'iudex-filter/key_helper'
19
+
20
module Iudex
  module SimHash
    module Filters
      # Mixin of factory methods for building a SimHashGenerator and the
      # stop word set it is constructed with.
      module FactoryHelper
        include Iudex::Core
        include Iudex::HTML

        # Stop word list located at config/stopwords.en, two directories
        # above this file.
        DEFAULT_WORDS = File.join( File.dirname( __FILE__ ), '..', '..',
                                   'config', 'stopwords.en' )

        # Build a Gen::StopWordSet from a file with one word per line.
        # Surrounding whitespace is stripped; lines starting with '#'
        # and blank lines are skipped, so a stray empty line no longer
        # adds an empty string to the set.
        #
        # wfile:: Path of the word file (default: DEFAULT_WORDS).
        def simhash_stopwords( wfile = DEFAULT_WORDS )
          words = File.readlines( wfile ).map { |w| w.strip }
          words.reject! { |w| w.empty? || w =~ /^#/ }

          Gen::StopWordSet.new( words )
        end

        Element = Java::com.gravitext.xml.tree.Element

        # Build a SimHashGenerator.
        #
        # input::     Name of a method, invoked via send, that returns a
        #             list of [ key, ratio ] entries; ratio is optional.
        # stopwords:: Stop word set handed to the generator
        #             (default: simhash_stopwords).
        #
        # Keys whose value type is Element become tree inputs, all
        # other keys text inputs. wordy_ratio is only assigned when an
        # entry supplies a ratio.
        def simhash_generator( input = :simhash_generator_inputs,
                               stopwords = simhash_stopwords )

          inputs = send( input ).map { |r| r.to_a }.map do | key, ratio |
            key = key.to_k
            i = if( key.value_type == Element.java_class )
                  SimHashGenerator::Input.forTree( key )
                else
                  SimHashGenerator::Input.forText( key )
                end
            i.wordy_ratio = ratio if ratio
            i
          end

          SimHashGenerator.new( inputs, stopwords )
        end

        # Default generator inputs: the title with no ratio given, and
        # the source tree with a ratio of 0.30.
        def simhash_generator_inputs
          [ [ :title ],
            [ :source_tree, 0.30 ] ]
        end

      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-simhash'
18
+ require 'iudex-simhash/factory_helper'
19
+
20
# Factory for a SimHashGenPerfTest over the bundled sample document
# test/html/gentest.html.
class SimHashGenPerfTestFactory
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  Order = HTMLTreeFilter::Order

  import 'iudex.simhash.filters.SimHashGenPerfTest'

  def initialize
    UniMap.define_accessors
  end

  # Run the sample content through the filter chain once (the initial
  # parse), then wrap the filtered map and a simhash generator in a
  # SimHashGenPerfTest.
  def perf_test
    parsed = content
    filter_chain.filter( parsed )

    SimHashGenPerfTest.new( parsed, simhash_generator )
  end

  # A new UniMap whose source is set from the sample HTML file, passed
  # to HTMLUtils.source as bytes with the "UTF-8" encoding name.
  def content
    umap  = UniMap.new
    bytes = File.read( sample_path ).to_java_bytes

    umap.source = HTMLUtils.source( bytes, "UTF-8" )
    umap
  end

  # The "perf_test" FilterChain: HTML parse, title extraction, title
  # TextCtrlWSFilter, then the tree filters applied depth first.
  def filter_chain
    stages = [ HTMLParseFilter.new( ContentKeys::SOURCE,
                                    nil, HTMLKeys::SOURCE_TREE ),
               TitleExtractor.new,
               TextCtrlWSFilter.new( ContentKeys::TITLE ),
               HTMLTreeFilter.new( HTMLKeys::SOURCE_TREE,
                                   tree_filters, Order::DEPTH_FIRST ) ]

    FilterChain.new( "perf_test", stages )
  end

  private

  # Path of the sample document, two directories above this file.
  def sample_path
    File.join( File.dirname( __FILE__ ), '..', '..',
               'test', 'html', 'gentest.html' )
  end

  # TreeFilterChain applied to the parsed source tree.
  def tree_filters
    TreeFilterChain.new( [ MetaSkipFilter.new,
                           CharactersNormalizer.new,
                           WordCounter.new,
                           WordyCounter.new ] )
  end

end