iudex-simhash 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
@@ -0,0 +1,16 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ bin/iudex-simhash-perftest
7
+ config/stopwords.en
8
+ lib/iudex-simhash/base.rb
9
+ lib/iudex-simhash.rb
10
+ lib/iudex-simhash/factory_helper.rb
11
+ lib/iudex-simhash/sim_hash_gen_perf_test_factory.rb
12
+ test/setup.rb
13
+ test/test_fuzzy_set.rb
14
+ test/test_simhash_generator.rb
15
+ test/html/gentest.html
16
+ lib/iudex-simhash/iudex-simhash-1.0.0.jar
@@ -0,0 +1,33 @@
1
+ = iudex-simhash
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-simhash gem contains support for generation and
9
+ searching over simhash fingerprints.
10
+
11
+ == Dependencies
12
+
13
+ * Java 1.5+
14
+
15
+ For tests:
16
+
17
+ * JRuby 1.3+
18
+
19
+ == License
20
+
21
+ Copyright (c) 2010-2011 David Kellum
22
+
23
+ Licensed under the Apache License, Version 2.0 (the "License"); you
24
+ may not use this file except in compliance with the License. You
25
+ may obtain a copy of the License at:
26
+
27
+ http://www.apache.org/licenses/LICENSE-2.0
28
+
29
+ Unless required by applicable law or agreed to in writing, software
30
+ distributed under the License is distributed on an "AS IS" BASIS,
31
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
32
+ implied. See the License for the specific language governing
33
+ permissions and limitations under the License.
@@ -0,0 +1,39 @@
1
# -*- ruby -*-
#
# Build script for the iudex-simhash gem, driven by rjack-tarpit.
# Besides the TarPit-defined tasks, it adds release sanity checks that
# compare the gem version against pom.xml and History.rdoc.

$LOAD_PATH << './lib'
require 'iudex-simhash/base'

require 'rubygems'
gem 'rjack-tarpit', '~> 1.2'
require 'rjack-tarpit'

t = RJack::TarPit.new( 'iudex-simhash',
                       Iudex::SimHash::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'iudex-html', '~> 1.0.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
                        [ 'rjack-logback', '~> 1.0' ] ]
end

file 'Manifest.txt' => [ 'pom.xml' ]

# The version is escaped before interpolation so that each '.' only
# matches a literal dot (unescaped, /1.0.0/ would also accept "1x0y0").
task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/,
                     /#{Regexp.escape( t.version.to_s )}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/,
                     / #{Regexp.escape( t.version.to_s )} / )
end
# The first History.rdoc heading must end with a parenthesized date.
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

# Wire the checks in as prerequisites of the release related tasks.
task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #.hashdot.vm.options += -Xmx1g
4
+ # For 64b add: -XX:+UseCompressedOops
5
+
6
+ #--
7
+ # Copyright (c) 2010-2011 David Kellum
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
10
+ # may not use this file except in compliance with the License. You may
11
+ # obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
18
+ # implied. See the License for the specific language governing
19
+ # permissions and limitations under the License.
20
+ #++
21
+
22
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
23
+
24
+ require 'rubygems'
25
+ require 'rjack-logback'
26
+
27
+ require 'iudex-simhash'
28
+ require 'iudex-simhash/sim_hash_gen_perf_test_factory'
29
+
30
+ require 'optparse'
31
+
32
+ require 'gravitext-util'
33
+ require 'gravitext-util/perftest'
34
+
35
# Command line driver for the iudex-simhash performance tests. Builds
# either the fuzzy set tests or the simhash generator test and runs
# them through Gravitext::PerfTest::Harness.
module PerfTest
  include Iudex::SimHash::BruteFuzzy
  import 'iudex.simhash.brutefuzzy.FuzzySetPerfTest'

  MD = FuzzySetPerfTest::Mode

  # Parse options and the test selector from args, then execute the
  # selected tests.
  #
  # args:: argument list to parse; it is consumed destructively
  #        (default: ARGV)
  #
  # Prints usage and exits with status 1 if the first non-option
  # argument is neither 'fuzzy' nor 'gen'.
  def self.run( args = ARGV )
    options = {}
    oparser = OptionParser.new do |opts|
      opts.banner = <<USAGE
Usage: iudex-simhash-perftest (fuzzy|gen) [options] <length> <threshold-bits>
USAGE
      opts.on( "-t", "--threads N", Integer, "Test with thread count" ) do |n|
        options[ :threads ] = n
      end
      opts.on( "--tree-bits=N", Integer,
               "Test TREE only, with segments from specified minimum bits" ) do |n|
        options[ :tree_bits ] = n
      end
      opts.on( "--capacity=N", Integer,
               "Set initial capacity to N (default: length)" ) do |n|
        options[ :capacity ] = n
      end
    end

    # Parse the list that was passed in, rather than always the global
    # ARGV, so the args parameter is actually honored.
    oparser.parse!( args )

    tests = case( args.shift )
            when 'fuzzy'
              fuzzy_tests( options, args )
            when 'gen'
              gen_test
            else
              puts oparser
              exit 1
            end

    harness = Gravitext::PerfTest::Harness.new( tests )

    harness.thread_count = options[ :threads ] if options[ :threads ]
    harness.execute
  end

  # Return an array of FuzzySetPerfTest instances: one per selected
  # segment bit width, where width 0 selects LIST mode and any other
  # width selects TREE mode.
  #
  # options:: Hash of parsed options (:tree_bits, :capacity)
  # args::    remaining positional arguments, <length> then
  #           <threshold-bits> (default: ARGV)
  def self.fuzzy_tests( options, args = ARGV )
    # Apply the defaults before to_i: nil.to_i is 0, and 0 is truthy
    # in ruby, so `args.shift.to_i || 1000` could never fall back.
    length    = ( args.shift || 1000 ).to_i
    threshold = ( args.shift || 3 ).to_i

    min_bits = options[ :tree_bits ] || 0
    segbits = [ 0, 4, 8, 16 ].select do |b|
      ( b == 0 || ( ( 64 / b ) > threshold ) ) && ( b >= min_bits )
    end

    segbits.map do |bits|
      mode = ( bits == 0 ) ? MD::LIST : MD::TREE
      pt = FuzzySetPerfTest.new( mode, length, threshold )
      pt.max_bits = bits if bits > 0
      pt.initial_capacity = options[ :capacity ] if options[ :capacity ]
      pt
    end
  end

  # Return a single element array holding the simhash generator
  # performance test.
  def self.gen_test
    [ SimHashGenPerfTestFactory.new.perf_test ]
  end

end
103
+
104
+ PerfTest.run
@@ -0,0 +1,35 @@
1
+ # Common 3-4 character English words to be dropped.
2
+ # Note that 1-2 character words are already dropped by the Tokenizer.
3
+ all
4
+ and
5
+ are
6
+ been
7
+ but
8
+ can
9
+ did
10
+ does
11
+ for
12
+ from
13
+ get
14
+ had
15
+ have
16
+ isn
17
+ its
18
+ not
19
+ one
20
+ our
21
+ out
22
+ said
23
+ say
24
+ that
25
+ the
26
+ they
27
+ this
28
+ was
29
+ were
30
+ what
31
+ when
32
+ will
33
+ with
34
+ you
35
+ your
@@ -0,0 +1,44 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-html'
18
+
19
+ require 'iudex-simhash/base.rb'
20
+
21
+ require 'java'
22
+
23
+ module Iudex
24
+ module SimHash # Ruby-side namespace exposing the iudex.simhash Java classes imported below
25
+
26
+ require "iudex-simhash/iudex-simhash-#{VERSION}.jar" # jar name is built from VERSION (iudex-simhash/base.rb, required above)
27
+
28
+ import 'iudex.simhash.SimHashKeys'
29
+
30
+ module BruteFuzzy # imports from the iudex.simhash.brutefuzzy Java package
31
+ import 'iudex.simhash.brutefuzzy.BruteFuzzy'
32
+ import 'iudex.simhash.brutefuzzy.FuzzyList64'
33
+ import 'iudex.simhash.brutefuzzy.FuzzyTree64'
34
+ end
35
+
36
+ module Filters # imports from the iudex.simhash.filters Java package
37
+ import 'iudex.simhash.filters.SimHashGenerator'
38
+ end
39
+
40
+ module Gen # imports from the iudex.simhash.gen Java package
41
+ import 'iudex.simhash.gen.StopWordSet'
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module Iudex
18
+ module SimHash
19
+ VERSION = '1.0.0' # Gem version; the Rakefile passes it to TarPit and iudex-simhash.rb uses it to name the jar to require
20
+ end
21
+ end
@@ -0,0 +1,65 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-simhash'
18
+ require 'iudex-filter/key_helper'
19
+
20
module Iudex
  module SimHash
    module Filters

      # Mixin of factory methods for constructing a SimHashGenerator
      # filter and the StopWordSet it uses.
      module FactoryHelper
        include Iudex::Core
        include Iudex::HTML

        # Path to the stop word list found at config/stopwords.en,
        # relative to this file.
        DEFAULT_WORDS = File.join( File.dirname( __FILE__ ), '..', '..',
                                   'config', 'stopwords.en' )

        # Load a StopWordSet from a file containing one word per line.
        # Surrounding whitespace is stripped; lines starting with '#'
        # and blank lines are ignored.
        #
        # wfile:: path of the word file (default: DEFAULT_WORDS)
        def simhash_stopwords( wfile = DEFAULT_WORDS )
          words = File.open( wfile ) { |fin| fin.readlines }
          words.map! { |w| w.strip }
          # Blank lines are dropped as well as comments, so that an
          # empty string is never registered as a stop word.
          words.reject! { |w| w.empty? || w =~ /^#/ }

          Gen::StopWordSet.new( words )
        end

        Element = Java::com.gravitext.xml.tree.Element

        # Create a SimHashGenerator from a list of input specs.
        #
        # input::     name of a method (called via send) returning an
        #             array of [ key, ratio ] specs, where ratio is
        #             optional
        # stopwords:: the stop word set passed to the generator
        #             (default: simhash_stopwords)
        def simhash_generator( input = :simhash_generator_inputs,
                               stopwords = simhash_stopwords )

          inputs = send( input ).map { |r| r.to_a }.map do | key, ratio |
            # NOTE(review): to_k is presumably provided by
            # iudex-filter/key_helper (required by this file) -- confirm.
            key = key.to_k
            # Element valued keys become tree inputs, all others text.
            i = if( key.value_type == Element.java_class )
                  SimHashGenerator::Input.forTree( key )
                else
                  SimHashGenerator::Input.forText( key )
                end
            i.wordy_ratio = ratio if ratio
            i
          end

          SimHashGenerator.new( inputs, stopwords )
        end

        # Default input specs: the title with no ratio given, and the
        # source tree with a wordy ratio of 0.30.
        def simhash_generator_inputs
          [ [ :title ],
            [ :source_tree, 0.30 ] ]
        end

      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-simhash'
18
+ require 'iudex-simhash/factory_helper'
19
+
20
# Factory for a SimHashGenPerfTest, using test/html/gentest.html
# (relative to this file) as input, pre-processed by a parse and tree
# filter chain.
class SimHashGenPerfTestFactory
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::Core::Filters
  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Filter::Core
  include Iudex::SimHash::Filters

  include Iudex::SimHash::Filters::FactoryHelper

  import 'iudex.html.HTMLUtils'

  Order = HTMLTreeFilter::Order

  import 'iudex.simhash.filters.SimHashGenPerfTest'

  def initialize
    UniMap.define_accessors
  end

  # Return a SimHashGenPerfTest over the sample content, after the
  # content has been run once through filter_chain.
  def perf_test
    parsed = content
    filter_chain.filter( parsed )

    SimHashGenPerfTest.new( parsed, simhash_generator )
  end

  # Return a new UniMap whose source is set from the bytes of the
  # sample HTML file, declared as "UTF-8".
  def content
    sample = UniMap.new

    html_path = File.join( File.dirname( __FILE__ ), '..', '..',
                           'test', 'html', 'gentest.html' )
    raw_html = File.read( html_path )

    sample.source = HTMLUtils::source( raw_html.to_java_bytes, "UTF-8" )
    sample
  end

  # Return the "perf_test" FilterChain: HTML parse, title extraction
  # and cleanup, followed by a depth first pass of the tree filters.
  def filter_chain
    chain = [ HTMLParseFilter.new( ContentKeys::SOURCE,
                                   nil, HTMLKeys::SOURCE_TREE ),
              TitleExtractor.new,
              TextCtrlWSFilter.new( ContentKeys::TITLE ) ]

    tree_filters = [ MetaSkipFilter.new,
                     CharactersNormalizer.new,
                     WordCounter.new,
                     WordyCounter.new ]
    tree_chain = TreeFilterChain.new( tree_filters )

    chain.push( HTMLTreeFilter.new( HTMLKeys::SOURCE_TREE,
                                    tree_chain, Order::DEPTH_FIRST ) )

    FilterChain.new( "perf_test", chain )
  end

end