iudex-worker 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,17 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ bin/iudex-worker-fg
6
+ config/config.rb
7
+ init/iudex-worker
8
+ lib/iudex-worker/base.rb
9
+ lib/iudex-worker.rb
10
+ lib/iudex-worker/agent.rb
11
+ lib/iudex-worker/fetch_helper.rb
12
+ lib/iudex-worker/filter_chain_factory.rb
13
+ lib/iudex-worker/prioritizer.rb
14
+ test/setup.rb
15
+ test/test_agent.rb
16
+ test/test_filter_chain_factory.rb
17
+ test/test_prioritizer.rb
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = iudex-worker
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-worker gem provides a worker daemon for feed/page
9
+ processing.
10
+
11
+ == License
12
+
13
+ Copyright (c) 2008-2011 David Kellum
14
+
15
+ Licensed under the Apache License, Version 2.0 (the "License"); you
16
+ may not use this file except in compliance with the License. You may
17
+ obtain a copy of the License at
18
+
19
+ http://www.apache.org/licenses/LICENSE-2.0
20
+
21
+ Unless required by applicable law or agreed to in writing, software
22
+ distributed under the License is distributed on an "AS IS" BASIS,
23
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24
+ implied. See the License for the specific language governing
25
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,41 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+ require 'iudex-worker/base'
5
+
6
+ require 'rubygems'
7
+ gem 'rjack-tarpit', '~> 1.2'
8
+ require 'rjack-tarpit'
9
+
10
+ t = RJack::TarPit.new( 'iudex-worker', Iudex::Worker::VERSION, :java_platform )
11
+
12
+ t.specify do |h|
13
+ h.developer( "David Kellum", "dek-oss@gravitext.com" )
14
+
15
+ h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
16
+ [ 'rjack-logback', '~> 1.0' ],
17
+ [ 'iudex-da', '~> 1.0.0' ],
18
+ [ 'iudex-rome', '~> 1.0.0' ],
19
+ [ 'iudex-html', '~> 1.0.0' ],
20
+ [ 'iudex-simhash', '~> 1.0.0' ],
21
+ [ 'iudex-httpclient-3', '~> 1.0.0' ] ]
22
+
23
+ h.testlib = :minitest
24
+ h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ] ]
25
+ end
26
+
27
# Release sanity check: History.rdoc must mention the version being built.
# Presumably test_line_match finds a line matching the first pattern and
# requires it to also match the second -- confirm against rjack-tarpit.
# The version is passed through Regexp.escape so that its dots match
# literally instead of acting as "any character" wildcards.
task :chk_hist_vers do
  t.test_line_match( 'History.rdoc', /^==/,
                     / #{ Regexp.escape( t.version.to_s ) } / )
end

# Release sanity check: the init script's `gem` line for this gem must pin
# exactly the version being built. Name and version are escaped for the
# same reason as above.
task :chk_init_v do
  t.test_line_match( 'init/iudex-worker',
                     /^gem.+#{ Regexp.escape( t.name.to_s ) }/,
                     /= #{ Regexp.escape( t.version.to_s ) }/ )
end
33
+ task :chk_hist_date do
34
+ t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
35
+ end
36
+
37
+ task :gem => [ :chk_hist_vers, :chk_init_v ]
38
+ task :tag => [ :chk_hist_vers, :chk_init_v, :chk_hist_date ]
39
+ task :push => [ :chk_hist_date ]
40
+
41
+ t.define_tasks
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
21
+
22
+ require 'optparse'
23
+
24
# Foreground launcher for the iudex worker. The module only serves as a
# namespace so that the RJack and Iudex mixins below do not leak into
# Object.
module IudexBinScript

  require 'rubygems'
  require 'rjack-logback'

  include RJack
  Logback.config_console( :mdc => "uhash", :thread => true )

  require 'iudex-worker'
  include Iudex

  # Route Hooker's own log messages to the 'iudex' SLF4J logger.
  Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }

  OptionParser.new do |opts|
    # Neither flag takes an argument, so the blocks declare no parameter.
    opts.on( "-v", "--version", "Display version" ) do
      puts "iudex-worker: #{ Worker::VERSION }"
      # Printing the version on request is a successful outcome; exit with
      # status 0 so callers and shell scripts do not see a failure.
      exit 0
    end
    opts.on( "-d", "--debug", "Enable verbose DEBUG logging" ) do
      Logback[ 'iudex' ].level = Logback::DEBUG
    end
    # Let Hooker add its own configuration option(s) to the parser.
    Hooker.register_config( opts )
  end.parse!

  Worker::Agent.new.run

end
data/config/config.rb ADDED
@@ -0,0 +1,46 @@
1
+
2
+ RJack::Logback[ 'iudex.filter.core.FilterChain.agent' ].level =
3
+ RJack::Logback::DEBUG
4
+
5
+ Iudex.configure do |c|
6
+
7
+ threads = 3
8
+
9
+ c.setup_connect_props do
10
+ { :ds_pool => { :max_active => threads / 3 * 2,
11
+ :max_idle => threads / 3 },
12
+ :loglevel => 1 }
13
+ end
14
+
15
+ c.setup_http_client_3 do |mgr|
16
+ mgr.manager_params.max_total_connections = threads * 10
17
+ end
18
+
19
+ c.setup_visit_executor do |vx|
20
+ vx.max_threads = threads
21
+ vx.min_host_delay = 100 #ms
22
+ end
23
+
24
+ c.setup_work_poller do |wp|
25
+ wp.min_order_remaining_ratio = 0.30
26
+ wp.max_check_interval = 100 #ms
27
+ wp.min_poll_interval = 2_000 #ms
28
+ end
29
+
30
+ c.setup_filter_factory do |ff|
31
+
32
+ def ff.barc_writer
33
+ bw = super
34
+ bw.do_compress = false
35
+ bw
36
+ end
37
+
38
+ def ff.barc_directory
39
+ bdir = super
40
+ bdir.target_length = 2 * ( 1024 ** 2 )
41
+ bdir
42
+ end
43
+
44
+ end
45
+
46
+ end
data/init/iudex-worker ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #. hashdot.profile += daemon
4
+ #. hashdot.pid_file = ./iudex-worker.pid
5
+ #. hashdot.io_redirect.file = ./iudex-worker.log
6
+ #. hashdot.vm.options += -Xmx1g
7
+ #. hashdot.vm.options += -XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled
8
+
9
+ #--
10
+ # Copyright (c) 2008-2011 David Kellum
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
13
+ # may not use this file except in compliance with the License. You
14
+ # may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
21
+ # implied. See the License for the specific language governing
22
+ # permissions and limitations under the License.
23
+ #++
24
+
25
+ require 'rubygems'
26
+
27
+ gem( "iudex-worker", "= 1.0.0" )
28
+
29
+ module IudexInitScript
30
+
31
+ require 'rjack-logback'
32
+ include RJack
33
+ Logback.config_console( :full => true, :thread => true, :mdc => "uhash" )
34
+
35
+ require 'iudex-worker'
36
+ include Iudex
37
+
38
+ Hooker.log_with { |m| SLF4J[ 'iudex' ].info( m.rstrip ) }
39
+
40
+ if File.exist?( './config.rb' )
41
+ Hooker.load_file( './config.rb' )
42
+ end
43
+
44
+ Worker::Agent.new.run
45
+
46
+ end
@@ -0,0 +1,87 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-da'
18
+ require 'iudex-da/key_helper'
19
+ require 'iudex-da/pool_data_source_factory'
20
+
21
+ require 'iudex-httpclient-3'
22
+
23
+ require 'iudex-worker'
24
+ require 'iudex-worker/filter_chain_factory'
25
+
26
+ require 'hooker'
27
+
28
+ module Iudex
29
+ module Worker
30
+
31
+ class Agent
32
+ include Iudex::DA
33
+ include Iudex::Filter::KeyHelper
34
+ include Iudex::Core
35
+ include Iudex::Worker
36
+ include Gravitext::HTMap
37
+
38
+ def initialize
39
+ Hooker.apply( [ :iudex, :worker ], self )
40
+ end
41
+
42
+ def poll_keys
43
+ [ :url, :type, :priority, :next_visit_after, :last_visit, :etag ]
44
+ end
45
+
46
+ # Note: derived classes can (and do) override this to supply a different factory.
47
+ def filter_chain_factory
48
+ FilterChainFactory.new( 'agent' )
49
+ end
50
+
51
# Run the worker until the visit executor finishes or is interrupted.
#
# Inside the :iudex Hooker scope this creates a pooled data source, a
# WorkPoller over it, an HTTP client manager and the filter chain
# factory, applying the :work_poller, :filter_factory and
# :visit_executor hooks to each as it is built. A VisitExecutor is then
# started on the resulting chain and joined.
#
# Cleanup is performed in +ensure+ clauses so that the HTTP manager is
# shut down and the data source factory closed even when setup, a hook,
# or the executor raises. Each resource is only released once it has
# been successfully created/started, in reverse order of acquisition.
def run
  Hooker.with( :iudex ) do
    dsf = PoolDataSourceFactory.new
    data_source = dsf.create
    begin
      cmapper = ContentMapper.new( keys( poll_keys ) )
      wpoller = WorkPoller.new( data_source, cmapper )
      Hooker.apply( :work_poller, wpoller )

      mgr = HTTPClient3.create_manager
      mgr.start
      begin
        http_client = HTTPClient3::HTTPClient3.new( mgr.client )

        fcf = filter_chain_factory
        fcf.http_client = http_client
        fcf.data_source = data_source

        Hooker.apply( :filter_factory, fcf )

        fcf.filter do |chain|
          vexec = VisitExecutor.new( chain, wpoller )
          Hooker.apply( :visit_executor, vexec )

          Hooker.log_not_applied # All hooks should be used by now

          vexec.start
          vexec.join # Run until interrupted
        end # fcf closes
      ensure
        mgr.shutdown
      end
    ensure
      dsf.close
    end
  end
end
84
+ end
85
+
86
+ end
87
+ end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module Iudex
18
+ module Worker
19
+ VERSION = '1.0.0'
20
+ end
21
+ end
@@ -0,0 +1,79 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-worker'
18
+
19
+ module Iudex
20
+ module Worker
21
+
22
+ module FetchHelper
23
+ include Iudex::HTTP
24
+ include Iudex::Core::Filters
25
+
26
+ def create_content_fetcher( accept_types, receiver_sym )
27
+ cf = ContentFetcher.new( http_client, create_chain( receiver_sym ) )
28
+
29
+ alist = accept_list( accept_types )
30
+ cf.accepted_content_types = alist unless alist.include?( '*/*' )
31
+
32
+ headers = [ [ 'User-Agent', http_user_agent ],
33
+ [ 'Accept', accept_header( accept_types ) ] ]
34
+
35
+ cf.request_headers = headers.map { |kv| Header.new( *kv ) }
36
+
37
+ cf
38
+ end
39
+
40
+ def http_user_agent
41
+ ( "Mozilla/5.0 (compatible; " +
42
+ "Iudex #{Iudex::Worker::VERSION}; " +
43
+ "+http://gravitext.com/iudex)" )
44
+ end
45
+
46
+ def feed_mime_types
47
+ # List of accepted MIME types, grouped and ordered in descending
48
+ # order of preference.
49
+ [ %w[ application/atom+xml application/rss+xml ],
50
+ %w[ application/rdf+xml application/xml ],
51
+ %w[ text/xml ],
52
+ %w[ text/* ],
53
+ %w[ */* ] ]
54
+ end
55
+
56
+ def page_mime_types
57
+ [ %w[ application/xhtml+xml text/html ],
58
+ %w[ application/xml ],
59
+ %w[ text/* ] ]
60
+ end
61
+
62
# Build the value of an HTTP Accept header from grouped MIME types.
#
# +types+ is an Array of Arrays of MIME type Strings, ordered from most
# to least preferred. Types in the first group are emitted bare (an
# implicit q=1.0); every later group is tagged with a quality value 0.1
# lower than the group before it.
#
# The q-value is derived from the group index and rendered with exactly
# one decimal place. Repeatedly subtracting 0.1 from a Float drifts
# (0.8 - 0.1 => 0.7000000000000001), which produced q-values with more
# digits than HTTP allows. The value is also floored at 0.1 so a long
# list never yields q=0.0 ("not acceptable") or a negative weight.
#
# Returns the comma-joined header value as a String ("" for no types).
def accept_header( types )
  ts = types.each_with_index.map do |tgrp, i|
    next tgrp if i.zero?
    qs = format( '%.1f', [ 10 - i, 1 ].max / 10.0 )
    tgrp.map { |m| "#{m};q=#{qs}" }
  end
  ts.flatten.join( ',' )
end
71
+
72
+ def accept_list( types )
73
+ types.flatten
74
+ end
75
+
76
+ end
77
+
78
+ end
79
+ end
@@ -0,0 +1,201 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-filter'
18
+ require 'iudex-filter/filter_chain_factory'
19
+
20
+ require 'iudex-barc'
21
+
22
+ require 'iudex-core'
23
+
24
+ require 'iudex-da'
25
+ require 'iudex-da/factory_helper'
26
+
27
+ require 'iudex-rome'
28
+
29
+ require 'iudex-html'
30
+ require 'iudex-html/factory_helper'
31
+
32
+ require 'iudex-simhash'
33
+ require 'iudex-simhash/factory_helper'
34
+
35
+ require 'iudex-worker'
36
+ require 'iudex-worker/fetch_helper'
37
+ require 'iudex-worker/prioritizer'
38
+
39
+ module Iudex
40
+ module Worker
41
+
42
+ class FilterChainFactory < Iudex::Filter::Core::FilterChainFactory
43
+ include Iudex::Filter::Core
44
+ include Iudex::BARC
45
+ include Iudex::Core
46
+ include Iudex::Core::Filters
47
+ include Iudex::ROME
48
+
49
+ include Iudex::DA::Filters::FactoryHelper
50
+ include Iudex::HTML::Filters::FactoryHelper
51
+ include Iudex::SimHash::Filters::FactoryHelper
52
+ include FetchHelper
53
+
54
+ attr_accessor :http_client
55
+ attr_accessor :data_source
56
+
57
+ def initialize( name )
58
+ super
59
+ setup_reporters
60
+ end
61
+
62
+ def setup_reporters
63
+ add_summary_reporter
64
+ add_by_filter_reporter
65
+ end
66
+
67
+ def filters
68
+ [ UHashMDCSetter.new,
69
+ DefaultFilter.new,
70
+ super,
71
+ type_switch ].flatten
72
+ end
73
+
74
+ def listeners
75
+ super + [ MDCUnsetter.new( "uhash" ) ]
76
+ end
77
+
78
+ def type_map
79
+ { "FEED" => feed_fetcher,
80
+ "PAGE" => page_fetcher }
81
+ end
82
+
83
+ def type_switch( tmap = type_map )
84
+ create_switch( :type.to_k, tmap )
85
+ end
86
+
87
+ def feed_fetcher
88
+ [ create_content_fetcher( feed_mime_types, :feed_receiver ) ]
89
+ end
90
+
91
+ def page_fetcher
92
+ [ create_content_fetcher( page_mime_types, :page_receiver ) ]
93
+ end
94
+
95
+ def feed_receiver
96
+ [ RomeFeedParser.new,
97
+ DefaultFilter.new,
98
+ DateChangeFilter.new( false ),
99
+ feed_updater ]
100
+ end
101
+
102
+ def feed_updater
103
+ create_update_filter( keys( feed_update_keys ),
104
+ :feed_post, :feed_ref_update, :feed_ref_new )
105
+ end
106
+
107
+ def feed_ref_new
108
+ [ UHashMDCSetter.new,
109
+ ref_common_cleanup,
110
+ Prioritizer.new( "feed-ref-new",
111
+ :constant => 50,
112
+ :min_next => 0.0 ) ].flatten
113
+ end
114
+
115
+ def feed_ref_update
116
+ [ UHashMDCSetter.new,
117
+ DateChangeFilter.new( true ),
118
+ ref_common_cleanup,
119
+ Prioritizer.new( "feed-ref-update",
120
+ :constant => 10,
121
+ :min_next => 0.0 ) ].flatten
122
+ end
123
+
124
+ # Note: *_post is run possibly twice, once for both base content
125
+ # map and referer map.
126
+ def feed_post
127
+ [ UHashMDCSetter.new,
128
+ ref_common_cleanup,
129
+ Prioritizer.new( "feed-post",
130
+ :constant => 30,
131
+ :visiting_now => true ),
132
+ last_visit_setter ].flatten
133
+ end
134
+
135
+ def ref_common_cleanup
136
+ [ ref_html_filters,
137
+ TextCtrlWSFilter.new( :title.to_k ),
138
+ FutureDateFilter.new( :pub_date.to_k ) ].flatten
139
+ end
140
+
141
+ def ref_html_filters
142
+ [ html_clean_filters( :title ),
143
+ html_clean_filters( :summary ),
144
+ html_clean_filters( :content ),
145
+ html_write_filter( :summary ),
146
+ html_write_filter( :content ) ].flatten
147
+ end
148
+
149
+ def feed_update_keys
150
+ page_update_keys + [ :title, :summary, :content ]
151
+ end
152
+
153
+ def page_receiver
154
+ [ html_clean_filters( :source ),
155
+ simhash_generator,
156
+ page_updater ].flatten
157
+ end
158
+
159
+ def barc_writer
160
+ bw = BARCWriter.new( barc_directory )
161
+ bw.do_compress = true
162
+ bw
163
+ end
164
+
165
+ def barc_directory
166
+ bdir = BARCDirectory.new( Java::java.io.File.new( "./barc" ) )
167
+ bdir
168
+ end
169
+
170
+ def page_updater
171
+ create_update_filter( keys( page_update_keys ), :page_post )
172
+ end
173
+
174
+ # Note: *_post is run possibly twice, once for both base content
175
+ # map and referer map.
176
+ def page_post
177
+ [ UHashMDCSetter.new,
178
+ barc_writer, # Not run in 302 referer case, since no SOURCE.
179
+ Prioritizer.new( "page-post",
180
+ :constant => 0,
181
+ :min_next => ( 30 * 60.0 ),
182
+ :visiting_now => true ),
183
+ last_visit_setter ]
184
+ end
185
+
186
+ def page_update_keys
187
+ [ :uhash, :host, :url, :type,
188
+ :ref_pub_date, :pub_date,
189
+ :priority, :last_visit, :next_visit_after,
190
+ :status, :etag, :reason, :referer, :referent,
191
+ :cache_file, :cache_file_offset, :simhash ]
192
+ end
193
+
194
+ def last_visit_setter
195
+ Copier.new( *keys( :visit_start, :last_visit ) )
196
+ end
197
+
198
+ end
199
+
200
+ end
201
+ end
@@ -0,0 +1,152 @@
1
+ # -*- coding: utf-8 -*-
2
+ #--
3
+ # Copyright (c) 2008-2011 David Kellum
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
6
+ # may not use this file except in compliance with the License. You may
7
+ # obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+ #++
17
+
18
+ module Iudex
19
+ module Worker
20
+
21
+ class Prioritizer < Iudex::Filter::FilterBase
22
+ include Math
23
+
24
+ attr_accessor :constant
25
+ attr_accessor :impedance
26
+ attr_accessor :min_next
27
+ attr_accessor :min_next_unmodified
28
+ attr_accessor :factors
29
+ attr_accessor :visiting_now
30
+
31
+ WWW_BEGINS = Time.utc( 1991, "aug", 6, 20,0,0 ) # WWW begins
32
+ MINUTE = 60.0
33
+ HOUR = 60.0 * 60.0
34
+
35
+ def initialize( name, opts = {} )
36
+ @name = name
37
+
38
+ @constant = 0.0
39
+ @impedance = 2.0
40
+ @min_next_unmodified = 5 * MINUTE
41
+ @min_next = 10 * MINUTE
42
+ @visiting_now = false
43
+
44
+ @factors = [ [ 30.0, :ref_change_rate ],
45
+ [ -1.0, :log_pub_age ] ]
46
+
47
+ @log = RJack::SLF4J[ self.class ]
48
+
49
+ opts.each { |k,v| send( k.to_s + '=', v ) }
50
+ yield self if block_given?
51
+
52
+ @min_next_unmodified = [ @min_next_unmodified, @min_next ].min
53
+ @constant = @constant.to_f
54
+ end
55
+
56
+ def describe
57
+ [ @name, @constant, @min_next ]
58
+ end
59
+
60
+ def filter( map )
61
+
62
+ map.priority, delta = adjust( map, map.priority )
63
+
64
+ map.next_visit_after = ( as_time( map.visit_start ) + delta if delta )
65
+
66
+ true
67
+ end
68
+
69
+ def adjust( map, priority, delta = 0.0 )
70
+
71
+ old_priority = priority
72
+ memo = ( ( ( @constant != 0.0 ) ? [ @constant ] : [] ) if @log.debug? )
73
+
74
+ new_priority = @factors.inject( @constant ) do | p, (w,func)|
75
+ comp = ( w * send( func, map ) )
76
+ ( memo << "%.1f:%s" % [ comp.to_f, func ] ) if memo && comp != 0.0
77
+ p + comp
78
+ end
79
+
80
+ #FIXME: new_priority = [ 0.0, new_priority ].max
81
+
82
+ priority = ( ( ( priority || 0.0 ) * @impedance + new_priority ) /
83
+ ( @impedance + 1 ) )
84
+
85
+ if map.last_visit || visiting_now
86
+ delta = ( map.status == 304 ) ? @min_next_unmodified : @min_next
87
+ else
88
+ delta = 0.0
89
+ end
90
+
91
+ @log.debug do
92
+ memo.join( ' + ' ) +
93
+ ( " :: %.1f -> %.1f = %.1f in %.1fs" %
94
+ ( [ old_priority, new_priority,
95
+ priority, delta ].map { |f| f.to_f } ) )
96
+ end
97
+
98
+ [ priority, delta ]
99
+ end
100
+
101
+ def log_pub_age( map )
102
+ diff = sdiff( ( map.pub_date || WWW_BEGINS ), map.visit_start ) / MINUTE
103
+ diff = 1.0 / MINUTE if diff < 1.0 / MINUTE
104
+ ( log( diff ) - log( 1.0 / MINUTE ) )
105
+ end
106
+
107
+ # FIXME: Useful?
108
+ # def ref_pub_age( map )
109
+ # map.visit_start - ( map.ref_pub_date || WWW_BEGINS )
110
+ # end
111
+
112
+ # References per hour, with updates rated at 1/4 a new reference.
113
+ def ref_change_rate( map )
114
+ s = since( map )
115
+ if s.nil? || s == 0.0
116
+ 0.0
117
+ else
118
+ ( ( ( map.new_references || 0.0 ) +
119
+ ( map.updated_references || 0.0 ) / 4.0 ) /
120
+ s *
121
+ HOUR )
122
+ end
123
+ end
124
+
125
+ def since( map )
126
+ sdiff( map.last_visit || oldest( map.references ),
127
+ map.visit_start )
128
+ end
129
+
130
+ def oldest( refs )
131
+ ( refs.map { |r| r.pub_date }.compact.min ) if refs
132
+ end
133
+
134
+ def sdiff( prev, now )
135
+ diff = as_time( now ) - as_time( prev || WWW_BEGINS )
136
+ ( diff < 0.0 ) ? 0.0 : diff
137
+ end
138
+
139
+ # FIXME: Generalize?
140
+ def as_time( torj )
141
+ if torj.is_a?( Time )
142
+ torj
143
+ else
144
+ ms = torj.time
145
+ Time.at( ms / 1_000, ( ms % 1_000 ) * 1_000 ) # s, µs
146
+ end
147
+ end
148
+
149
+ end
150
+
151
+ end
152
+ end
@@ -0,0 +1,20 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-core'
18
+
19
+ require 'iudex-worker/base'
20
+ require 'iudex-worker/agent'
data/test/setup.rb ADDED
@@ -0,0 +1,34 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
+ ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
20
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
21
+
22
+ require 'rubygems'
23
+ require 'rjack-logback'
24
+ RJack::Logback.config_console( :stderr => true )
25
+
26
+ require 'minitest/unit'
27
+ require 'minitest/autorun'
28
+
29
+ # Make test output logging compatible: no partial lines.
30
+ class TestOut
31
+ def print( *a ); $stdout.puts( *a ); end
32
+ def puts( *a ); $stdout.puts( *a ); end
33
+ end
34
+ MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-worker'
23
+
24
+ class TestAgent < MiniTest::Unit::TestCase
25
+ include Iudex::Worker
26
+ include RJack
27
+
28
+ def setup
29
+ Logback[ 'iudex.worker.FilterChainFactory' ].level = Logback::WARN
30
+ end
31
+
32
+ def teardown
33
+ Logback[ 'iudex.worker.FilterChainFactory' ].level = nil
34
+ Hooker.send( :clear )
35
+ end
36
+
37
+ def test_agent_default
38
+ assert_agent
39
+ end
40
+
41
+ def test_agent_with_sample_config
42
+ # Test out the sample config
43
+ Hooker.load_file( File.join( File.dirname( __FILE__ ),
44
+ '..', 'config', 'config.rb' ) )
45
+
46
+ assert_agent
47
+ end
48
+
49
+ def assert_agent
50
+
51
+ # Stub VisitExecutor.start to allow agent.run to return early.
52
+ Hooker.add( [ :iudex, :visit_executor ] ) do |vexec|
53
+ def vexec.start
54
+ #disable
55
+ end
56
+ end
57
+
58
+ agent = Agent.new
59
+ agent.run
60
+ pass
61
+
62
+ Hooker.check_not_applied do |*args|
63
+ flunk( "Hooks not applied: " + args.inspect )
64
+ end
65
+ pass
66
+
67
+ end
68
+
69
+ end
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
23
+
24
+ RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
25
+
26
+ require 'iudex-httpclient-3'
27
+
28
+ require 'iudex-da'
29
+ require 'iudex-da/pool_data_source_factory'
30
+
31
+ require 'iudex-worker'
32
+ require 'iudex-worker/filter_chain_factory'
33
+
34
+ class TestFilterChainFactory < MiniTest::Unit::TestCase
35
+ include Iudex
36
+ include Gravitext::HTMap
37
+
38
# Build a complete worker filter chain against a live HTTP client and
# data source, then push the same FEED url through it twice (assumed new
# on the first pass, an update on the second).
#
# The HTTP manager and data source factory are released in +ensure+
# clauses so that a failed assertion or a fetch error does not leave
# them open for the rest of the test run.
def test_filter
  fcf = Worker::FilterChainFactory.new( "test" )

  mgr = HTTPClient3.create_manager
  mgr.start
  begin
    fcf.http_client = HTTPClient3::HTTPClient3.new( mgr.client )

    dsf = DA::PoolDataSourceFactory.new
    fcf.data_source = dsf.create
    begin
      fcf.filter do |chain|
        # Run twice (assume new the first time, updates the second).
        2.times do
          content = UniMap.new
          content.url = Core::VisitURL.normalize( "http://gravitext.com/atom.xml" )
          content.type = "FEED"
          content.priority = 1.0
          assert( chain.filter( content ) )
        end
      end
    ensure
      dsf.close
    end
  ensure
    mgr.shutdown
  end
end
62
+
63
+ end
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ RJack::Logback.config_console( :stderr => true, :mdc => "uhash" )
23
+
24
+ RJack::Logback[ 'iudex' ].level = RJack::Logback::DEBUG
25
+
26
+ require 'iudex-worker'
27
+ require 'iudex-worker/prioritizer'
28
+
29
+ class TestPrioritizer < MiniTest::Unit::TestCase
30
+ include Iudex::Worker
31
+ include Gravitext::HTMap
32
+ JDate = Java::java.util.Date
33
+
34
+ UniMap.define_accessors
35
+
36
+ def test_identity
37
+ m = new_map
38
+ p = Prioritizer.new( "test", :constant => 3.2,
39
+ :factors => [], :impedance => 0 )
40
+
41
+ assert( p.filter( m ) )
42
+ assert_equal_fuzzy( 3.2, m.priority )
43
+ assert_equal( m.visit_start, m.next_visit_after )
44
+ end
45
+
46
+ def test_visiting_now
47
+ m = new_map
48
+ p = Prioritizer.new( "test", :visiting_now => true )
49
+
50
+ assert( p.filter( m ) )
51
+ assert_equal_fuzzy( m.visit_start.time/1000.0 + p.min_next,
52
+ m.next_visit_after.time/1000.0 )
53
+ end
54
+
55
+ def test_oldest
56
+ map = new_map
57
+
58
+ times = [ Time.utc( 2010, "jul", 17, 19,0,0 ),
59
+ oldest = Time.utc( 2010, "jul", 17, 18,0,0 ),
60
+ Time.utc( 2010, "jul", 17, 20,0,0 ),
61
+ nil ]
62
+
63
+ map.references = times.map do |t|
64
+ ref = UniMap.new
65
+ ref.pub_date = t
66
+ ref
67
+ end
68
+
69
+ p = prioritizer
70
+ assert_equal( oldest, p.as_time( p.oldest( map.references ) ) )
71
+ end
72
+
73
+ def test_since_last
74
+ assert_equal( 60.0, prioritizer.since( one_minute_last_map ) )
75
+ end
76
+
77
+ def test_ref_change_rate
78
+ map = one_minute_last_map
79
+ map.new_references = 1
80
+ map.updated_references = 4
81
+ assert_equal_fuzzy( 120, prioritizer.ref_change_rate( map ) )
82
+ end
83
+
84
+ def one_minute_last_map
85
+ map = new_map
86
+ map.visit_start = start = JDate.new
87
+ map.last_visit = JDate.new( start.time - ( 1_000 * 60 ) )
88
+ map
89
+ end
90
+
91
+ def assert_equal_fuzzy( l, r )
92
+ assert( ( l - r ).abs < 0.1, "#{l} ~!= #{r}" )
93
+ end
94
+
95
+ def new_map
96
+ map = UniMap.new
97
+ map.visit_start = JDate.new
98
+ map
99
+ end
100
+
101
+ def prioritizer
102
+ Prioritizer.new( "test" )
103
+ end
104
+
105
+ end
metadata ADDED
@@ -0,0 +1,182 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-worker
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-core
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rjack-logback
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: "1.0"
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: iudex-da
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ~>
45
+ - !ruby/object:Gem::Version
46
+ version: 1.0.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: iudex-rome
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ~>
56
+ - !ruby/object:Gem::Version
57
+ version: 1.0.0
58
+ type: :runtime
59
+ version_requirements: *id004
60
+ - !ruby/object:Gem::Dependency
61
+ name: iudex-html
62
+ prerelease: false
63
+ requirement: &id005 !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 1.0.0
69
+ type: :runtime
70
+ version_requirements: *id005
71
+ - !ruby/object:Gem::Dependency
72
+ name: iudex-simhash
73
+ prerelease: false
74
+ requirement: &id006 !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ~>
78
+ - !ruby/object:Gem::Version
79
+ version: 1.0.0
80
+ type: :runtime
81
+ version_requirements: *id006
82
+ - !ruby/object:Gem::Dependency
83
+ name: iudex-httpclient-3
84
+ prerelease: false
85
+ requirement: &id007 !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ~>
89
+ - !ruby/object:Gem::Version
90
+ version: 1.0.0
91
+ type: :runtime
92
+ version_requirements: *id007
93
+ - !ruby/object:Gem::Dependency
94
+ name: minitest
95
+ prerelease: false
96
+ requirement: &id008 !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.7.1
102
+ - - <
103
+ - !ruby/object:Gem::Version
104
+ version: "2.1"
105
+ type: :development
106
+ version_requirements: *id008
107
+ - !ruby/object:Gem::Dependency
108
+ name: rjack-tarpit
109
+ prerelease: false
110
+ requirement: &id009 !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ~>
114
+ - !ruby/object:Gem::Version
115
+ version: 1.3.0
116
+ type: :development
117
+ version_requirements: *id009
118
+ description: |-
119
+ Iudex is a general purpose web crawler and feed processor in
120
+ ruby/java. The iudex-worker gem provides a worker daemon for feed/page
121
+ processing.
122
+ email:
123
+ - dek-oss@gravitext.com
124
+ executables:
125
+ - iudex-worker-fg
126
+ extensions: []
127
+
128
+ extra_rdoc_files:
129
+ - Manifest.txt
130
+ - History.rdoc
131
+ - README.rdoc
132
+ files:
133
+ - History.rdoc
134
+ - Manifest.txt
135
+ - README.rdoc
136
+ - Rakefile
137
+ - bin/iudex-worker-fg
138
+ - config/config.rb
139
+ - init/iudex-worker
140
+ - lib/iudex-worker/base.rb
141
+ - lib/iudex-worker.rb
142
+ - lib/iudex-worker/agent.rb
143
+ - lib/iudex-worker/fetch_helper.rb
144
+ - lib/iudex-worker/filter_chain_factory.rb
145
+ - lib/iudex-worker/prioritizer.rb
146
+ - test/setup.rb
147
+ - test/test_agent.rb
148
+ - test/test_filter_chain_factory.rb
149
+ - test/test_prioritizer.rb
150
+ has_rdoc: true
151
+ homepage: http://github.com/dekellum/iudex
152
+ licenses: []
153
+
154
+ post_install_message:
155
+ rdoc_options:
156
+ - --main
157
+ - README.rdoc
158
+ require_paths:
159
+ - lib
160
+ required_ruby_version: !ruby/object:Gem::Requirement
161
+ none: false
162
+ requirements:
163
+ - - ">="
164
+ - !ruby/object:Gem::Version
165
+ version: "0"
166
+ required_rubygems_version: !ruby/object:Gem::Requirement
167
+ none: false
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: "0"
172
+ requirements: []
173
+
174
+ rubyforge_project: iudex-worker
175
+ rubygems_version: 1.5.1
176
+ signing_key:
177
+ specification_version: 3
178
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
179
+ test_files:
180
+ - test/test_agent.rb
181
+ - test/test_filter_chain_factory.rb
182
+ - test/test_prioritizer.rb