iudex-worker 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,17 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ bin/iudex-worker-fg
6
+ config/config.rb
7
+ init/iudex-worker
8
+ lib/iudex-worker/base.rb
9
+ lib/iudex-worker.rb
10
+ lib/iudex-worker/agent.rb
11
+ lib/iudex-worker/fetch_helper.rb
12
+ lib/iudex-worker/filter_chain_factory.rb
13
+ lib/iudex-worker/prioritizer.rb
14
+ test/setup.rb
15
+ test/test_agent.rb
16
+ test/test_filter_chain_factory.rb
17
+ test/test_prioritizer.rb
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = iudex-worker
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-worker gem provides a worker daemon for feed/page
9
+ processing.
10
+
11
+ == License
12
+
13
+ Copyright (c) 2008-2011 David Kellum
14
+
15
+ Licensed under the Apache License, Version 2.0 (the "License"); you
16
+ may not use this file except in compliance with the License. You may
17
+ obtain a copy of the License at
18
+
19
+ http://www.apache.org/licenses/LICENSE-2.0
20
+
21
+ Unless required by applicable law or agreed to in writing, software
22
+ distributed under the License is distributed on an "AS IS" BASIS,
23
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24
+ implied. See the License for the specific language governing
25
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,41 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+ require 'iudex-worker/base'
5
+
6
+ require 'rubygems'
7
+ gem 'rjack-tarpit', '~> 1.2'
8
+ require 'rjack-tarpit'
9
+
10
+ t = RJack::TarPit.new( 'iudex-worker', Iudex::Worker::VERSION, :java_platform )
11
+
12
+ t.specify do |h|
13
+ h.developer( "David Kellum", "dek-oss@gravitext.com" )
14
+
15
+ h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
16
+ [ 'rjack-logback', '~> 1.0' ],
17
+ [ 'iudex-da', '~> 1.0.0' ],
18
+ [ 'iudex-rome', '~> 1.0.0' ],
19
+ [ 'iudex-html', '~> 1.0.0' ],
20
+ [ 'iudex-simhash', '~> 1.0.0' ],
21
+ [ 'iudex-httpclient-3', '~> 1.0.0' ] ]
22
+
23
+ h.testlib = :minitest
24
+ h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ] ]
25
+ end
26
+
27
+ task :chk_hist_vers do
28
+ t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
29
+ end
30
+ task :chk_init_v do
31
+ t.test_line_match( 'init/iudex-worker', /^gem.+#{t.name}/, /= #{t.version}/ )
32
+ end
33
+ task :chk_hist_date do
34
+ t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
35
+ end
36
+
37
+ task :gem => [ :chk_hist_vers, :chk_init_v ]
38
+ task :tag => [ :chk_hist_vers, :chk_init_v, :chk_hist_date ]
39
+ task :push => [ :chk_hist_date ]
40
+
41
+ t.define_tasks
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
21
+
22
+ require 'optparse'
23
+
24
# Foreground launcher for the iudex worker. Loading this module
# configures console logging, parses the command line, and then runs a
# Worker::Agent. Everything happens at load time; the module only
# exists to keep the RJack and Iudex includes out of the top-level
# namespace.
module IudexBinScript

  require 'rubygems'
  require 'rjack-logback'

  include RJack
  Logback.config_console( :mdc => "uhash", :thread => true )

  require 'iudex-worker'
  include Iudex

  # Send Hooker's own log messages to the 'iudex' SLF4J logger (INFO).
  Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }

  OptionParser.new do |opts|
    opts.on( "-v", "--version", "Display version" ) do
      puts "iudex-worker: #{ Worker::VERSION }"
      # Reporting the version is a successful outcome, so exit with
      # status 0 rather than signalling failure to the calling shell.
      exit 0
    end
    opts.on( "-d", "--debug", "Enable verbose DEBUG logging" ) do
      Logback[ 'iudex' ].level = Logback::DEBUG
    end
    # NOTE(review): presumably lets Hooker add its own (config file)
    # option(s) to the parser -- confirm against the hooker gem.
    Hooker.register_config( opts )
  end.parse!

  Worker::Agent.new.run

end
data/config/config.rb ADDED
@@ -0,0 +1,46 @@
1
+
2
+ RJack::Logback[ 'iudex.filter.core.FilterChain.agent' ].level =
3
+ RJack::Logback::DEBUG
4
+
5
+ Iudex.configure do |c|
6
+
7
+ threads = 3
8
+
9
+ c.setup_connect_props do
10
+ { :ds_pool => { :max_active => threads / 3 * 2,
11
+ :max_idle => threads / 3 },
12
+ :loglevel => 1 }
13
+ end
14
+
15
+ c.setup_http_client_3 do |mgr|
16
+ mgr.manager_params.max_total_connections = threads * 10
17
+ end
18
+
19
+ c.setup_visit_executor do |vx|
20
+ vx.max_threads = threads
21
+ vx.min_host_delay = 100 #ms
22
+ end
23
+
24
+ c.setup_work_poller do |wp|
25
+ wp.min_order_remaining_ratio = 0.30
26
+ wp.max_check_interval = 100 #ms
27
+ wp.min_poll_interval = 2_000 #ms
28
+ end
29
+
30
+ c.setup_filter_factory do |ff|
31
+
32
+ def ff.barc_writer
33
+ bw = super
34
+ bw.do_compress = false
35
+ bw
36
+ end
37
+
38
+ def ff.barc_directory
39
+ bdir = super
40
+ bdir.target_length = 2 * ( 1024 ** 2 )
41
+ bdir
42
+ end
43
+
44
+ end
45
+
46
+ end
data/init/iudex-worker ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #. hashdot.profile += daemon
4
+ #. hashdot.pid_file = ./iudex-worker.pid
5
+ #. hashdot.io_redirect.file = ./iudex-worker.log
6
+ #. hashdot.vm.options += -Xmx1g
7
+ #. hashdot.vm.options += -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled
8
+
9
+ #--
10
+ # Copyright (c) 2008-2011 David Kellum
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
13
+ # may not use this file except in compliance with the License. You
14
+ # may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
21
+ # implied. See the License for the specific language governing
22
+ # permissions and limitations under the License.
23
+ #++
24
+
25
+ require 'rubygems'
26
+
27
+ gem( "iudex-worker", "= 1.0.0" )
28
+
29
+ module IudexInitScript
30
+
31
+ require 'rjack-logback'
32
+ include RJack
33
+ Logback.config_console( :full => true, :thread => true, :mdc => "uhash" )
34
+
35
+ require 'iudex-worker'
36
+ include Iudex
37
+
38
+ Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
39
+
40
+ if File.exist?( './config.rb' )
41
+ Hooker.load_file( './config.rb' )
42
+ end
43
+
44
+ Worker::Agent.new.run
45
+
46
+ end
@@ -0,0 +1,87 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-da'
18
+ require 'iudex-da/key_helper'
19
+ require 'iudex-da/pool_data_source_factory'
20
+
21
+ require 'iudex-httpclient-3'
22
+
23
+ require 'iudex-worker'
24
+ require 'iudex-worker/filter_chain_factory'
25
+
26
+ require 'hooker'
27
+
28
+ module Iudex
29
+ module Worker
30
+
31
+ class Agent
32
+ include Iudex::DA
33
+ include Iudex::Filter::KeyHelper
34
+ include Iudex::Core
35
+ include Iudex::Worker
36
+ include Gravitext::HTMap
37
+
38
+ def initialize
39
+ Hooker.apply( [ :iudex, :worker ], self )
40
+ end
41
+
42
+ def poll_keys
43
+ [ :url, :type, :priority, :next_visit_after, :last_visit, :etag ]
44
+ end
45
+
46
+ # Note: derived classes can (and do) override this to supply a different factory.
47
+ def filter_chain_factory
48
+ FilterChainFactory.new( 'agent' )
49
+ end
50
+
51
# Assemble the worker and run it inside the :iudex Hooker scope.
#
# Creates a pooled data source, a WorkPoller over it, an HTTPClient3
# connection manager and client, and the filter chain factory returned
# by #filter_chain_factory. Hooks :work_poller, :filter_factory and
# :visit_executor are applied to the respective objects as they are
# created. A VisitExecutor is then started and joined, so this method
# normally blocks until the executor finishes or is interrupted.
#
# The connection manager and the data source factory are released in
# ensure clauses, so they are shut down even if setup or the executor
# raises. On the normal path the order is unchanged: manager first,
# then data source.
def run
  Hooker.with( :iudex ) do
    dsf = PoolDataSourceFactory.new
    data_source = dsf.create

    begin
      cmapper = ContentMapper.new( keys( poll_keys ) )
      wpoller = WorkPoller.new( data_source, cmapper )
      Hooker.apply( :work_poller, wpoller )

      mgr = HTTPClient3.create_manager
      mgr.start

      begin
        http_client = HTTPClient3::HTTPClient3.new( mgr.client )

        fcf = filter_chain_factory
        fcf.http_client = http_client
        fcf.data_source = data_source

        Hooker.apply( :filter_factory, fcf )

        fcf.filter do |chain|
          vexec = VisitExecutor.new( chain, wpoller )
          Hooker.apply( :visit_executor, vexec )

          Hooker.log_not_applied # All hooks should be used by now

          vexec.start
          vexec.join #Run until interrupted
        end # fcf closes
      ensure
        # Only reached once the manager has been started.
        mgr.shutdown
      end
    ensure
      # Only reached once the data source has been created.
      dsf.close
    end
  end
end
84
+ end
85
+
86
+ end
87
+ end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module Iudex
18
+ module Worker
19
+ VERSION = '1.0.0'
20
+ end
21
+ end
@@ -0,0 +1,79 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-worker'
18
+
19
+ module Iudex
20
+ module Worker
21
+
22
+ module FetchHelper
23
+ include Iudex::HTTP
24
+ include Iudex::Core::Filters
25
+
26
+ def create_content_fetcher( accept_types, receiver_sym )
27
+ cf = ContentFetcher.new( http_client, create_chain( receiver_sym ) )
28
+
29
+ alist = accept_list( accept_types )
30
+ cf.accepted_content_types = alist unless alist.include?( '*/*' )
31
+
32
+ headers = [ [ 'User-Agent', http_user_agent ],
33
+ [ 'Accept', accept_header( accept_types ) ] ]
34
+
35
+ cf.request_headers = headers.map { |kv| Header.new( *kv ) }
36
+
37
+ cf
38
+ end
39
+
40
+ def http_user_agent
41
+ ( "Mozilla/5.0 (compatible; " +
42
+ "Iudex #{Iudex::Worker::VERSION}; " +
43
+ "+http://gravitext.com/iudex)" )
44
+ end
45
+
46
+ def feed_mime_types
47
+ # List of accepted mime types grouped and ordered in descending
48
+ # order of preference.
49
+ [ %w[ application/atom+xml application/rss+xml ],
50
+ %w[ application/rdf+xml application/xml ],
51
+ %w[ text/xml ],
52
+ %w[ text/* ],
53
+ %w[ */* ] ]
54
+ end
55
+
56
+ def page_mime_types
57
+ [ %w[ application/xhtml+xml text/html ],
58
+ %w[ application/xml ],
59
+ %w[ text/* ] ]
60
+ end
61
+
62
# Build the value of an HTTP Accept header from grouped mime types.
#
# types:: Array of Arrays of mime type Strings, with the groups listed
#         in descending order of preference.
#
# Types in the first group are emitted bare (an implied q=1.0). Each
# later group is tagged with a quality value one tenth lower than the
# group before it (q=0.9, q=0.8, ...). Returns the comma separated
# String, or "" when types is empty.
def accept_header( types )
  ts = types.each_with_index.map do |tgrp, i|
    if i.zero?
      tgrp
    else
      # Derive q from the group index instead of repeatedly subtracting
      # 0.1 from a Float: the running subtraction accumulates error and
      # produced values such as "q=0.7000000000000001". The floor of
      # 0.1 keeps very long lists from reaching q=0 (which means "not
      # acceptable") or going negative.
      q = [ 10 - i, 1 ].max / 10.0
      tgrp.map { |m| "#{m};q=#{q}" }
    end
  end
  ts.flatten.join( ',' )
end
71
+
72
+ def accept_list( types )
73
+ types.flatten
74
+ end
75
+
76
+ end
77
+
78
+ end
79
+ end
@@ -0,0 +1,201 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-filter'
18
+ require 'iudex-filter/filter_chain_factory'
19
+
20
+ require 'iudex-barc'
21
+
22
+ require 'iudex-core'
23
+
24
+ require 'iudex-da'
25
+ require 'iudex-da/factory_helper'
26
+
27
+ require 'iudex-rome'
28
+
29
+ require 'iudex-html'
30
+ require 'iudex-html/factory_helper'
31
+
32
+ require 'iudex-simhash'
33
+ require 'iudex-simhash/factory_helper'
34
+
35
+ require 'iudex-worker'
36
+ require 'iudex-worker/fetch_helper'
37
+ require 'iudex-worker/prioritizer'
38
+
39
+ module Iudex
40
+ module Worker
41
+
42
+ class FilterChainFactory < Iudex::Filter::Core::FilterChainFactory
43
+ include Iudex::Filter::Core
44
+ include Iudex::BARC
45
+ include Iudex::Core
46
+ include Iudex::Core::Filters
47
+ include Iudex::ROME
48
+
49
+ include Iudex::DA::Filters::FactoryHelper
50
+ include Iudex::HTML::Filters::FactoryHelper
51
+ include Iudex::SimHash::Filters::FactoryHelper
52
+ include FetchHelper
53
+
54
+ attr_accessor :http_client
55
+ attr_accessor :data_source
56
+
57
+ def initialize( name )
58
+ super
59
+ setup_reporters
60
+ end
61
+
62
+ def setup_reporters
63
+ add_summary_reporter
64
+ add_by_filter_reporter
65
+ end
66
+
67
+ def filters
68
+ [ UHashMDCSetter.new,
69
+ DefaultFilter.new,
70
+ super,
71
+ type_switch ].flatten
72
+ end
73
+
74
+ def listeners
75
+ super + [ MDCUnsetter.new( "uhash" ) ]
76
+ end
77
+
78
+ def type_map
79
+ { "FEED" => feed_fetcher,
80
+ "PAGE" => page_fetcher }
81
+ end
82
+
83
+ def type_switch( tmap = type_map )
84
+ create_switch( :type.to_k, tmap )
85
+ end
86
+
87
+ def feed_fetcher
88
+ [ create_content_fetcher( feed_mime_types, :feed_receiver ) ]
89
+ end
90
+
91
+ def page_fetcher
92
+ [ create_content_fetcher( page_mime_types, :page_receiver ) ]
93
+ end
94
+
95
+ def feed_receiver
96
+ [ RomeFeedParser.new,
97
+ DefaultFilter.new,
98
+ DateChangeFilter.new( false ),
99
+ feed_updater ]
100
+ end
101
+
102
+ def feed_updater
103
+ create_update_filter( keys( feed_update_keys ),
104
+ :feed_post, :feed_ref_update, :feed_ref_new )
105
+ end
106
+
107
+ def feed_ref_new
108
+ [ UHashMDCSetter.new,
109
+ ref_common_cleanup,
110
+ Prioritizer.new( "feed-ref-new",
111
+ :constant => 50,
112
+ :min_next => 0.0 ) ].flatten
113
+ end
114
+
115
+ def feed_ref_update
116
+ [ UHashMDCSetter.new,
117
+ DateChangeFilter.new( true ),
118
+ ref_common_cleanup,
119
+ Prioritizer.new( "feed-ref-update",
120
+ :constant => 10,
121
+ :min_next => 0.0 ) ].flatten
122
+ end
123
+
124
+ # Note: *_post is run possibly twice, once for both base content
125
+ # map and referer map.
126
+ def feed_post
127
+ [ UHashMDCSetter.new,
128
+ ref_common_cleanup,
129
+ Prioritizer.new( "feed-post",
130
+ :constant => 30,
131
+ :visiting_now => true ),
132
+ last_visit_setter ].flatten
133
+ end
134
+
135
+ def ref_common_cleanup
136
+ [ ref_html_filters,
137
+ TextCtrlWSFilter.new( :title.to_k ),
138
+ FutureDateFilter.new( :pub_date.to_k ) ].flatten
139
+ end
140
+
141
+ def ref_html_filters
142
+ [ html_clean_filters( :title ),
143
+ html_clean_filters( :summary ),
144
+ html_clean_filters( :content ),
145
+ html_write_filter( :summary ),
146
+ html_write_filter( :content ) ].flatten
147
+ end
148
+
149
+ def feed_update_keys
150
+ page_update_keys + [ :title, :summary, :content ]
151
+ end
152
+
153
+ def page_receiver
154
+ [ html_clean_filters( :source ),
155
+ simhash_generator,
156
+ page_updater ].flatten
157
+ end
158
+
159
+ def barc_writer
160
+ bw = BARCWriter.new( barc_directory )
161
+ bw.do_compress = true
162
+ bw
163
+ end
164
+
165
+ def barc_directory
166
+ bdir = BARCDirectory.new( Java::java.io.File.new( "./barc" ) )
167
+ bdir
168
+ end
169
+
170
+ def page_updater
171
+ create_update_filter( keys( page_update_keys ), :page_post )
172
+ end
173
+
174
+ # Note: *_post is run possibly twice, once for both base content
175
+ # map and referer map.
176
+ def page_post
177
+ [ UHashMDCSetter.new,
178
+ barc_writer, # Not run in 302 referer case, since no SOURCE.
179
+ Prioritizer.new( "page-post",
180
+ :constant => 0,
181
+ :min_next => ( 30 * 60.0 ),
182
+ :visiting_now => true ),
183
+ last_visit_setter ]
184
+ end
185
+
186
+ def page_update_keys
187
+ [ :uhash, :host, :url, :type,
188
+ :ref_pub_date, :pub_date,
189
+ :priority, :last_visit, :next_visit_after,
190
+ :status, :etag, :reason, :referer, :referent,
191
+ :cache_file, :cache_file_offset, :simhash ]
192
+ end
193
+
194
+ def last_visit_setter
195
+ Copier.new( *keys( :visit_start, :last_visit ) )
196
+ end
197
+
198
+ end
199
+
200
+ end
201
+ end
@@ -0,0 +1,152 @@
1
+ # -*- coding: utf-8 -*-
2
+ #--
3
+ # Copyright (c) 2008-2011 David Kellum
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ #++
17
+
18
+ module Iudex
19
+ module Worker
20
+
21
+ class Prioritizer < Iudex::Filter::FilterBase
22
+ include Math
23
+
24
+ attr_accessor :constant
25
+ attr_accessor :impedance
26
+ attr_accessor :min_next
27
+ attr_accessor :min_next_unmodified
28
+ attr_accessor :factors
29
+ attr_accessor :visiting_now
30
+
31
+ WWW_BEGINS = Time.utc( 1991, "aug", 6, 20,0,0 ) # WWW begins
32
+ MINUTE = 60.0
33
+ HOUR = 60.0 * 60.0
34
+
35
+ def initialize( name, opts = {} )
36
+ @name = name
37
+
38
+ @constant = 0.0
39
+ @impedance = 2.0
40
+ @min_next_unmodified = 5 * MINUTE
41
+ @min_next = 10 * MINUTE
42
+ @visiting_now = false
43
+
44
+ @factors = [ [ 30.0, :ref_change_rate ],
45
+ [ -1.0, :log_pub_age ] ]
46
+
47
+ @log = RJack::SLF4J[ self.class ]
48
+
49
+ opts.each { |k,v| send( k.to_s + '=', v ) }
50
+ yield self if block_given?
51
+
52
+ @min_next_unmodified = [ @min_next_unmodified, @min_next ].min
53
+ @constant = @constant.to_f
54
+ end
55
+
56
+ def describe
57
+ [ @name, @constant, @min_next ]
58
+ end
59
+
60
+ def filter( map )
61
+
62
+ map.priority, delta = adjust( map, map.priority )
63
+
64
+ map.next_visit_after = ( as_time( map.visit_start ) + delta if delta )
65
+
66
+ true
67
+ end
68
+
69
+ def adjust( map, priority, delta = 0.0 )
70
+
71
+ old_priority = priority
72
+ memo = ( ( ( @constant != 0.0 ) ? [ @constant ] : [] ) if @log.debug? )
73
+
74
+ new_priority = @factors.inject( @constant ) do | p, (w,func)|
75
+ comp = ( w * send( func, map ) )
76
+ ( memo << "%.1f:%s" % [ comp.to_f, func ] ) if memo && comp != 0.0
77
+ p + comp
78
+ end
79
+
80
+ #FIXME: new_priority = [ 0.0, new_priority ].max
81
+
82
+ priority = ( ( ( priority || 0.0 ) * @impedance + new_priority ) /
83
+ ( @impedance + 1 ) )
84
+
85
+ if map.last_visit || visiting_now
86
+ delta = ( map.status == 304 ) ? @min_next_unmodified : @min_next
87
+ else
88
+ delta = 0.0
89
+ end
90
+
91
+ @log.debug do
92
+ memo.join( ' + ' ) +
93
+ ( " :: %.1f -> %.1f = %.1f in %.1fs" %
94
+ ( [ old_priority, new_priority,
95
+ priority, delta ].map { |f| f.to_f } ) )
96
+ end
97
+
98
+ [ priority, delta ]
99
+ end
100
+
101
+ def log_pub_age( map )
102
+ diff = sdiff( ( map.pub_date || WWW_BEGINS ), map.visit_start ) / MINUTE
103
+ diff = 1.0 / MINUTE if diff < 1.0 / MINUTE
104
+ ( log( diff ) - log( 1.0 / MINUTE ) )
105
+ end
106
+
107
+ # FIXME: Useful?
108
+ # def ref_pub_age( map )
109
+ # map.visit_start - ( map.ref_pub_date || WWW_BEGINS )
110
+ # end
111
+
112
+ # References per hour, with updates rated at 1/4 a new reference.
113
+ def ref_change_rate( map )
114
+ s = since( map )
115
+ if s.nil? || s == 0.0
116
+ 0.0
117
+ else
118
+ ( ( ( map.new_references || 0.0 ) +
119
+ ( map.updated_references || 0.0 ) / 4.0 ) /
120
+ s *
121
+ HOUR )
122
+ end
123
+ end
124
+
125
+ def since( map )
126
+ sdiff( map.last_visit || oldest( map.references ),
127
+ map.visit_start )
128
+ end
129
+
130
+ def oldest( refs )
131
+ ( refs.map { |r| r.pub_date }.compact.min ) if refs
132
+ end
133
+
134
+ def sdiff( prev, now )
135
+ diff = as_time( now ) - as_time( prev || WWW_BEGINS )
136
+ ( diff < 0.0 ) ? 0.0 : diff
137
+ end
138
+
139
+ # FIXME: Generalize?
140
+ def as_time( torj )
141
+ if torj.is_a?( Time )
142
+ torj
143
+ else
144
+ ms = torj.time
145
+ Time.at( ms / 1_000, ( ms % 1_000 ) * 1_000 ) # s, µs
146
+ end
147
+ end
148
+
149
+ end
150
+
151
+ end
152
+ end
@@ -0,0 +1,20 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-core'
18
+
19
+ require 'iudex-worker/base'
20
+ require 'iudex-worker/agent'
data/test/setup.rb ADDED
@@ -0,0 +1,34 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
+ ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
20
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
21
+
22
+ require 'rubygems'
23
+ require 'rjack-logback'
24
+ RJack::Logback.config_console( :stderr => true )
25
+
26
+ require 'minitest/unit'
27
+ require 'minitest/autorun'
28
+
29
+ # Make test output logging compatible: no partial lines.
30
+ class TestOut
31
+ def print( *a ); $stdout.puts( *a ); end
32
+ def puts( *a ); $stdout.puts( *a ); end
33
+ end
34
+ MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-worker'
23
+
24
+ class TestAgent < MiniTest::Unit::TestCase
25
+ include Iudex::Worker
26
+ include RJack
27
+
28
+ def setup
29
+ Logback[ 'iudex.worker.FilterChainFactory' ].level = Logback::WARN
30
+ end
31
+
32
+ def teardown
33
+ Logback[ 'iudex.worker.FilterChainFactory' ].level = nil
34
+ Hooker.send( :clear )
35
+ end
36
+
37
+ def test_agent_default
38
+ assert_agent
39
+ end
40
+
41
+ def test_agent_with_sample_config
42
+ # Test out the sample config
43
+ Hooker.load_file( File.join( File.dirname( __FILE__ ),
44
+ '..', 'config', 'config.rb' ) )
45
+
46
+ assert_agent
47
+ end
48
+
49
+ def assert_agent
50
+
51
+ # Stub VisitExecutor.start to allow agent.run to return early.
52
+ Hooker.add( [ :iudex, :visit_executor ] ) do |vexec|
53
+ def vexec.start
54
+ #disable
55
+ end
56
+ end
57
+
58
+ agent = Agent.new
59
+ agent.run
60
+ pass
61
+
62
+ Hooker.check_not_applied do |*args|
63
+ flunk( "Hooks not applied: " + args.inspect )
64
+ end
65
+ pass
66
+
67
+ end
68
+
69
+ end
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
23
+
24
+ RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
25
+
26
+ require 'iudex-httpclient-3'
27
+
28
+ require 'iudex-da'
29
+ require 'iudex-da/pool_data_source_factory'
30
+
31
+ require 'iudex-worker'
32
+ require 'iudex-worker/filter_chain_factory'
33
+
34
+ class TestFilterChainFactory < MiniTest::Unit::TestCase
35
+ include Iudex
36
+ include Gravitext::HTMap
37
+
38
+ def test_filter
39
+ fcf = Worker::FilterChainFactory.new( "test" )
40
+
41
+ mgr = HTTPClient3.create_manager
42
+ mgr.start
43
+ fcf.http_client = HTTPClient3::HTTPClient3.new( mgr.client )
44
+
45
+ dsf = DA::PoolDataSourceFactory.new
46
+ fcf.data_source = dsf.create
47
+
48
+ fcf.filter do |chain|
49
+ # Run twice (assume new the first time, updates the second).
50
+ 2.times do
51
+ content = UniMap.new
52
+ content.url = Core::VisitURL.normalize( "http://gravitext.com/atom.xml" )
53
+ content.type = "FEED"
54
+ content.priority = 1.0
55
+ assert( chain.filter( content ) )
56
+ end
57
+ end
58
+
59
+ mgr.shutdown
60
+ dsf.close
61
+ end
62
+
63
+ end
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
23
+
24
+ RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
25
+
26
+ require 'iudex-worker'
27
+ require 'iudex-worker/prioritizer'
28
+
29
+ class TestPrioritizer < MiniTest::Unit::TestCase
30
+ include Iudex::Worker
31
+ include Gravitext::HTMap
32
+ JDate = Java::java.util.Date
33
+
34
+ UniMap.define_accessors
35
+
36
+ def test_identity
37
+ m = new_map
38
+ p = Prioritizer.new( "test", :constant => 3.2,
39
+ :factors => [], :impedance => 0 )
40
+
41
+ assert( p.filter( m ) )
42
+ assert_equal_fuzzy( 3.2, m.priority )
43
+ assert_equal( m.visit_start, m.next_visit_after )
44
+ end
45
+
46
+ def test_visiting_now
47
+ m = new_map
48
+ p = Prioritizer.new( "test", :visiting_now => true )
49
+
50
+ assert( p.filter( m ) )
51
+ assert_equal_fuzzy( m.visit_start.time/1000.0 + p.min_next,
52
+ m.next_visit_after.time/1000.0 )
53
+ end
54
+
55
+ def test_oldest
56
+ map = new_map
57
+
58
+ times = [ Time.utc( 2010, "jul", 17, 19,0,0 ),
59
+ oldest = Time.utc( 2010, "jul", 17, 18,0,0 ),
60
+ Time.utc( 2010, "jul", 17, 20,0,0 ),
61
+ nil ]
62
+
63
+ map.references = times.map do |t|
64
+ ref = UniMap.new
65
+ ref.pub_date = t
66
+ ref
67
+ end
68
+
69
+ p = prioritizer
70
+ assert_equal( oldest, p.as_time( p.oldest( map.references ) ) )
71
+ end
72
+
73
+ def test_since_last
74
+ assert_equal( 60.0, prioritizer.since( one_minute_last_map ) )
75
+ end
76
+
77
+ def test_ref_change_rate
78
+ map = one_minute_last_map
79
+ map.new_references = 1
80
+ map.updated_references = 4
81
+ assert_equal_fuzzy( 120, prioritizer.ref_change_rate( map ) )
82
+ end
83
+
84
+ def one_minute_last_map
85
+ map = new_map
86
+ map.visit_start = start = JDate.new
87
+ map.last_visit = JDate.new( start.time - ( 1_000 * 60 ) )
88
+ map
89
+ end
90
+
91
+ def assert_equal_fuzzy( l, r )
92
+ assert( ( l - r ).abs < 0.1, "#{l} ~!= #{r}" )
93
+ end
94
+
95
+ def new_map
96
+ map = UniMap.new
97
+ map.visit_start = JDate.new
98
+ map
99
+ end
100
+
101
+ def prioritizer
102
+ Prioritizer.new( "test" )
103
+ end
104
+
105
+ end
metadata ADDED
@@ -0,0 +1,182 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-worker
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-core
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rjack-logback
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: "1.0"
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: iudex-da
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ~>
45
+ - !ruby/object:Gem::Version
46
+ version: 1.0.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: iudex-rome
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ~>
56
+ - !ruby/object:Gem::Version
57
+ version: 1.0.0
58
+ type: :runtime
59
+ version_requirements: *id004
60
+ - !ruby/object:Gem::Dependency
61
+ name: iudex-html
62
+ prerelease: false
63
+ requirement: &id005 !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 1.0.0
69
+ type: :runtime
70
+ version_requirements: *id005
71
+ - !ruby/object:Gem::Dependency
72
+ name: iudex-simhash
73
+ prerelease: false
74
+ requirement: &id006 !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ~>
78
+ - !ruby/object:Gem::Version
79
+ version: 1.0.0
80
+ type: :runtime
81
+ version_requirements: *id006
82
+ - !ruby/object:Gem::Dependency
83
+ name: iudex-httpclient-3
84
+ prerelease: false
85
+ requirement: &id007 !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ~>
89
+ - !ruby/object:Gem::Version
90
+ version: 1.0.0
91
+ type: :runtime
92
+ version_requirements: *id007
93
+ - !ruby/object:Gem::Dependency
94
+ name: minitest
95
+ prerelease: false
96
+ requirement: &id008 !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.7.1
102
+ - - <
103
+ - !ruby/object:Gem::Version
104
+ version: "2.1"
105
+ type: :development
106
+ version_requirements: *id008
107
+ - !ruby/object:Gem::Dependency
108
+ name: rjack-tarpit
109
+ prerelease: false
110
+ requirement: &id009 !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ~>
114
+ - !ruby/object:Gem::Version
115
+ version: 1.3.0
116
+ type: :development
117
+ version_requirements: *id009
118
+ description: |-
119
+ Iudex is a general purpose web crawler and feed processor in
120
+ ruby/java. The iudex-worker gem provides a worker daemon for feed/page
121
+ processing.
122
+ email:
123
+ - dek-oss@gravitext.com
124
+ executables:
125
+ - iudex-worker-fg
126
+ extensions: []
127
+
128
+ extra_rdoc_files:
129
+ - Manifest.txt
130
+ - History.rdoc
131
+ - README.rdoc
132
+ files:
133
+ - History.rdoc
134
+ - Manifest.txt
135
+ - README.rdoc
136
+ - Rakefile
137
+ - bin/iudex-worker-fg
138
+ - config/config.rb
139
+ - init/iudex-worker
140
+ - lib/iudex-worker/base.rb
141
+ - lib/iudex-worker.rb
142
+ - lib/iudex-worker/agent.rb
143
+ - lib/iudex-worker/fetch_helper.rb
144
+ - lib/iudex-worker/filter_chain_factory.rb
145
+ - lib/iudex-worker/prioritizer.rb
146
+ - test/setup.rb
147
+ - test/test_agent.rb
148
+ - test/test_filter_chain_factory.rb
149
+ - test/test_prioritizer.rb
150
+ has_rdoc: true
151
+ homepage: http://github.com/dekellum/iudex
152
+ licenses: []
153
+
154
+ post_install_message:
155
+ rdoc_options:
156
+ - --main
157
+ - README.rdoc
158
+ require_paths:
159
+ - lib
160
+ required_ruby_version: !ruby/object:Gem::Requirement
161
+ none: false
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: "0"
166
+ required_rubygems_version: !ruby/object:Gem::Requirement
167
+ none: false
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: "0"
172
+ requirements: []
173
+
174
+ rubyforge_project: iudex-worker
175
+ rubygems_version: 1.5.1
176
+ signing_key:
177
+ specification_version: 3
178
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
179
+ test_files:
180
+ - test/test_agent.rb
181
+ - test/test_filter_chain_factory.rb
182
+ - test/test_prioritizer.rb