iudex-da 1.2.1-java → 1.3.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2012 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-da'
23
+ require 'iudex-da/models'
24
+
25
# Round-trip test for the Url model: rows written via Url.create are read
# back through Url.where and Url.find_by_url.
class TestUrlModel < MiniTest::Unit::TestCase
  include Iudex::DA
  include Iudex::DA::ORM

  # Every test starts from a truncated Url model, so the counts asserted
  # below only reflect rows created by the test itself.
  def setup
    Url.truncate
  end

  def test_round_trip
    urls = [ "http://foo.gravitext.com/bar/1",
             "http://gravitext.com/2",
             "http://hometown.com/33" ]

    # Elements are plain Strings, so the block takes a single parameter
    # (a second one would always be nil).
    urls.each do |u|
      Url.create( :visit_url => u, :type => "PAGE" )
    end

    # Rows are created from :visit_url only, yet are expected to be
    # selectable by :domain; both foo.gravitext.com and gravitext.com are
    # expected to count under 'gravitext.com'.
    assert_equal( 2, Url.where( :domain => 'gravitext.com' ).count )
    assert_equal( 1, Url.where( :domain => 'hometown.com' ).count )

    # Look up the first URL again and check the stored attributes.
    sample = Url.find_by_url( urls[ 0 ] )
    assert_equal( urls[ 0 ], sample.url )
    assert_equal( 'PAGE', sample.type )

    # A URL that was never created must not be found.
    refute( Url.find_by_url( "http://spunk" ) )
  end

end
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2012 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-da'
23
+ require 'iudex-da/key_helper'
24
+ require 'iudex-da/pool_data_source_factory'
25
+ require 'iudex-da/models'
26
+
27
# Tests WorkPoller#poll against three Url rows created in setup, under
# various poller settings.
class TestWorkPoller < MiniTest::Unit::TestCase
  include Iudex::Filter::KeyHelper
  include Iudex::DA
  include Iudex::DA::ORM

  Gravitext::HTMap::UniMap.define_accessors

  # [ url, priority ] pairs created in setup, listed in the order the
  # default poll is expected to return them.
  URLS = [ [ "http://foo.gravitext.com/bar/1", 11 ],
           [ "http://hometown.com/33",         10 ],
           [ "http://gravitext.com/2",          9 ] ]

  attr_reader :poller

  # Recreate the URLS rows and build a fresh data source, mapper and
  # poller for each test.
  def setup
    Url.truncate

    URLS.each do | u, p |
      Url.create( :visit_url => u, :priority => p, :type => "PAGE" )
    end

    @factory     = PoolDataSourceFactory.new( :loglevel => 4 )
    @data_source = @factory.create
    @mapper      = ContentMapper.new( keys( :url, :type, :priority,
                                            :next_visit_after ) )
    @poller      = WorkPoller.new( @data_source, @mapper )
  end

  # Close the factory and drop the references created in setup.
  def teardown
    @factory.close
    # Previously assigned to the misspelled @date_source, which left
    # @data_source (set in setup) untouched.
    @data_source = nil
    @poller = nil
  end

  def test_default_poll
    assert_poll_order( URLS )
  end

  def test_poll_with_max_priority_urls
    poller.max_priority_urls = 4

    assert_poll_order( URLS )
  end

  def test_poll_with_domain_depth
    poller.domain_depth_coef = 0.125
    poller.max_priority_urls = 4

    assert_poll_order( URLS )
  end

  def test_poll_with_domain_depth_only
    poller.domain_depth_coef = 0.125
    poller.age_coef_1 = 0.0

    assert_poll_order( URLS )
  end

  # With do_domain_group set, the two gravitext.com URLs are expected
  # to come back adjacent to each other, ahead of hometown.com.
  def test_poll_with_domain_group
    poller.do_domain_group = true

    urls = [ [ "http://foo.gravitext.com/bar/1", 11 ],
             [ "http://gravitext.com/2",          9 ],
             [ "http://hometown.com/33",         10 ] ]

    assert_poll_order( urls )
  end

  # A single [ 'gravitext.com', 15000 ] entry is expected to yield only
  # the two gravitext.com rows.
  def test_poll_domain_union_1
    poller.domain_union = [ [ 'gravitext.com', 15000 ] ]

    result = poller.poll
    assert_equal( 2, result.size )
  end

  # Adding a [ nil, 10000 ] entry is expected to yield all three rows.
  def test_poll_domain_union_2
    poller.domain_union = [ [ 'gravitext.com', 15000 ],
                            [ nil, 10000 ] ]

    result = poller.poll
    assert_equal( 3, result.size )
  end

  # With entries of 1, 1 and 3 the poll is expected to yield two rows.
  # NOTE(review): the second element of each pair looks like a per-entry
  # limit -- confirm against WorkPoller.
  def test_poll_domain_union_3
    poller.domain_union = [ [ 'gravitext.com', 1 ],
                            [ 'hometown.com', 1 ],
                            [ nil, 3 ] ]

    result = poller.poll
    assert_equal( 2, result.size )
  end

  # With uhash_slice = [ 4, 5 ] only the hometown.com row is expected.
  def test_poll_uhash_slice
    poller.uhash_slice = [ 4, 5 ]

    urls = [ [ "http://hometown.com/33", 10 ] ]

    assert_poll_order( urls )
  end

  private

  # Polls once and asserts that the result holds exactly the URLs of
  # +expected+ (an Array of [ url, priority ] pairs), in that order.
  def assert_poll_order( expected )
    pos = 0
    poller.poll.each do |map|
      assert_equal( expected[ pos ][ 0 ], map.url.url, "pos #{pos}" )
      pos += 1
    end
    assert_equal( expected.size, pos )
  end

end
metadata CHANGED
@@ -1,199 +1,224 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: iudex-da
3
- version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 1.2.1
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.3.0
6
6
  platform: java
7
- authors:
8
- - David Kellum
9
- autorequire:
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2012-09-15 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: iudex-core
17
- version_requirements: &id001 !ruby/object:Gem::Requirement
18
- none: false
19
- requirements:
20
- - - ~>
21
- - !ruby/object:Gem::Version
22
- version: 1.2.1
23
- requirement: *id001
24
- prerelease: false
25
- type: :runtime
26
- - !ruby/object:Gem::Dependency
27
- name: activerecord
28
- version_requirements: &id002 !ruby/object:Gem::Requirement
29
- none: false
30
- requirements:
31
- - - ~>
32
- - !ruby/object:Gem::Version
33
- version: 3.1.3
34
- requirement: *id002
35
- prerelease: false
36
- type: :runtime
37
- - !ruby/object:Gem::Dependency
38
- name: jdbc-postgres
39
- version_requirements: &id003 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
42
- - - ~>
43
- - !ruby/object:Gem::Version
44
- version: 9.1.901
45
- requirement: *id003
46
- prerelease: false
47
- type: :runtime
48
- - !ruby/object:Gem::Dependency
49
- name: activerecord-jdbcpostgresql-adapter
50
- version_requirements: &id004 !ruby/object:Gem::Requirement
51
- none: false
52
- requirements:
53
- - - ~>
54
- - !ruby/object:Gem::Version
55
- version: 1.2.2
56
- requirement: *id004
57
- prerelease: false
58
- type: :runtime
59
- - !ruby/object:Gem::Dependency
60
- name: rjack-commons-dbcp
61
- version_requirements: &id005 !ruby/object:Gem::Requirement
62
- none: false
63
- requirements:
64
- - - ~>
65
- - !ruby/object:Gem::Version
66
- version: 1.4.0
67
- requirement: *id005
68
- prerelease: false
69
- type: :runtime
70
- - !ruby/object:Gem::Dependency
71
- name: rjack-commons-dbutils
72
- version_requirements: &id006 !ruby/object:Gem::Requirement
73
- none: false
74
- requirements:
75
- - - ~>
76
- - !ruby/object:Gem::Version
77
- version: 1.4.0
78
- requirement: *id006
79
- prerelease: false
80
- type: :runtime
81
- - !ruby/object:Gem::Dependency
82
- name: minitest
83
- version_requirements: &id007 !ruby/object:Gem::Requirement
84
- none: false
85
- requirements:
86
- - - ~>
87
- - !ruby/object:Gem::Version
88
- version: "2.3"
89
- requirement: *id007
90
- prerelease: false
91
- type: :development
92
- - !ruby/object:Gem::Dependency
93
- name: rjack-logback
94
- version_requirements: &id008 !ruby/object:Gem::Requirement
95
- none: false
96
- requirements:
97
- - - ~>
98
- - !ruby/object:Gem::Version
99
- version: "1.2"
100
- requirement: *id008
101
- prerelease: false
102
- type: :development
103
- - !ruby/object:Gem::Dependency
104
- name: rjack-tarpit
105
- version_requirements: &id009 !ruby/object:Gem::Requirement
106
- none: false
107
- requirements:
108
- - - ~>
109
- - !ruby/object:Gem::Version
110
- version: "2.0"
111
- requirement: *id009
112
- prerelease: false
113
- type: :development
12
+ date: 2012-10-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: iudex-core
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ! '>='
19
+ - !ruby/object:Gem::Version
20
+ version: 1.2.1
21
+ - - <
22
+ - !ruby/object:Gem::Version
23
+ version: '1.4'
24
+ none: false
25
+ requirement: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.2.1
30
+ - - <
31
+ - !ruby/object:Gem::Version
32
+ version: '1.4'
33
+ none: false
34
+ prerelease: false
35
+ type: :runtime
36
+ - !ruby/object:Gem::Dependency
37
+ name: sequel
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ~>
41
+ - !ruby/object:Gem::Version
42
+ version: 3.40.0
43
+ none: false
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ~>
47
+ - !ruby/object:Gem::Version
48
+ version: 3.40.0
49
+ none: false
50
+ prerelease: false
51
+ type: :runtime
52
+ - !ruby/object:Gem::Dependency
53
+ name: jdbc-postgres
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ~>
57
+ - !ruby/object:Gem::Version
58
+ version: 9.1.901
59
+ none: false
60
+ requirement: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ~>
63
+ - !ruby/object:Gem::Version
64
+ version: 9.1.901
65
+ none: false
66
+ prerelease: false
67
+ type: :runtime
68
+ - !ruby/object:Gem::Dependency
69
+ name: rjack-commons-dbcp
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ~>
73
+ - !ruby/object:Gem::Version
74
+ version: 1.4.0
75
+ none: false
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ~>
79
+ - !ruby/object:Gem::Version
80
+ version: 1.4.0
81
+ none: false
82
+ prerelease: false
83
+ type: :runtime
84
+ - !ruby/object:Gem::Dependency
85
+ name: rjack-commons-dbutils
86
+ version_requirements: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ~>
89
+ - !ruby/object:Gem::Version
90
+ version: 1.4.0
91
+ none: false
92
+ requirement: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.4.0
97
+ none: false
98
+ prerelease: false
99
+ type: :runtime
100
+ - !ruby/object:Gem::Dependency
101
+ name: minitest
102
+ version_requirements: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ~>
105
+ - !ruby/object:Gem::Version
106
+ version: '2.3'
107
+ none: false
108
+ requirement: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ~>
111
+ - !ruby/object:Gem::Version
112
+ version: '2.3'
113
+ none: false
114
+ prerelease: false
115
+ type: :development
116
+ - !ruby/object:Gem::Dependency
117
+ name: rjack-logback
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ~>
121
+ - !ruby/object:Gem::Version
122
+ version: '1.2'
123
+ none: false
124
+ requirement: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ~>
127
+ - !ruby/object:Gem::Version
128
+ version: '1.2'
129
+ none: false
130
+ prerelease: false
131
+ type: :development
132
+ - !ruby/object:Gem::Dependency
133
+ name: rjack-tarpit
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: '2.0'
139
+ none: false
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ~>
143
+ - !ruby/object:Gem::Version
144
+ version: '2.0'
145
+ none: false
146
+ prerelease: false
147
+ type: :development
114
148
  description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-da gem provides a PostgreSQL-based content meta-data store and work priority queue.
115
- email:
116
- - dek-oss@gravitext.com
117
- executables:
118
- - iudex-da-generate-test-data
119
- - iudex-da-import
120
- - iudex-da-simhash-dump
121
- - iudex-migrate
149
+ email:
150
+ - dek-oss@gravitext.com
151
+ executables:
152
+ - iudex-da-generate-test-data
153
+ - iudex-da-import
154
+ - iudex-da-simhash-dump
155
+ - iudex-migrate
122
156
  extensions: []
123
-
124
- extra_rdoc_files:
125
- - History.rdoc
126
- - README.rdoc
127
- files:
128
- - History.rdoc
129
- - Manifest.txt
130
- - README.rdoc
131
- - Rakefile
132
- - pom.xml
133
- - bin/iudex-da-generate-test-data
134
- - bin/iudex-da-import
135
- - bin/iudex-da-simhash-dump
136
- - bin/iudex-migrate
137
- - config/config.rb
138
- - db/0010_base_urls.rb
139
- - db/0020_add_feed_metadata.rb
140
- - db/0021_more_feed_text.rb
141
- - db/0030_add_priority.rb
142
- - db/0040_add_visit_after.rb
143
- - db/0050_add_cache_location.rb
144
- - db/0060_url_indexes.rb
145
- - db/0070_add_created_at.rb
146
- - db/0080_add_simhash.rb
147
- - db/0081_remove_simhash_index.rb
148
- - db/0110_host_to_domain.rb
149
- - db/index_next_visit/0100_add_index_next_visit.rb
150
- - db/simhash/0085_add_simhash_index.rb
151
- - lib/iudex-da/base.rb
152
- - lib/iudex-da.rb
153
- - lib/iudex-da/ar.rb
154
- - lib/iudex-da/config.rb
155
- - lib/iudex-da/factory_helper.rb
156
- - lib/iudex-da/importer.rb
157
- - lib/iudex-da/key_helper.rb
158
- - lib/iudex-da/pool_data_source_factory.rb
159
- - test/setup.rb
160
- - test/test_migrate.rb
161
- - test/test_poll_work.rb
162
- - test/test_pool_factory.rb
163
- - lib/iudex-da/iudex-da-1.2.1.jar
157
+ extra_rdoc_files:
158
+ - History.rdoc
159
+ - README.rdoc
160
+ files:
161
+ - History.rdoc
162
+ - Manifest.txt
163
+ - README.rdoc
164
+ - Rakefile
165
+ - pom.xml
166
+ - bin/iudex-da-generate-test-data
167
+ - bin/iudex-da-import
168
+ - bin/iudex-da-simhash-dump
169
+ - bin/iudex-migrate
170
+ - config/config.rb
171
+ - db/20111012173757_base.rb
172
+ - db/20120930173600_uhash_collation_order.rb
173
+ - db/index_next_visit/21500000000101_add_index_next_visit.rb
174
+ - db/simhash/21500000000001_add_simhash_index.rb
175
+ - lib/iudex-da/base.rb
176
+ - lib/iudex-da.rb
177
+ - lib/iudex-da/config.rb
178
+ - lib/iudex-da/factory_helper.rb
179
+ - lib/iudex-da/importer.rb
180
+ - lib/iudex-da/key_helper.rb
181
+ - lib/iudex-da/models.rb
182
+ - lib/iudex-da/orm.rb
183
+ - lib/iudex-da/pool_data_source_factory.rb
184
+ - lib/iudex-da/work_poller.rb
185
+ - test/setup.rb
186
+ - test/test_migrate.rb
187
+ - test/test_pool_factory.rb
188
+ - test/test_url_model.rb
189
+ - test/test_work_poller.rb
190
+ - lib/iudex-da/iudex-da-1.3.0.jar
164
191
  homepage: http://iudex.gravitext.com
165
192
  licenses: []
166
-
167
- post_install_message:
168
- rdoc_options:
169
- - --main
170
- - README.rdoc
171
- require_paths:
172
- - lib
173
- required_ruby_version: !ruby/object:Gem::Requirement
193
+ post_install_message:
194
+ rdoc_options:
195
+ - --main
196
+ - README.rdoc
197
+ require_paths:
198
+ - lib
199
+ required_ruby_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - ! '>='
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ segments:
205
+ - 0
206
+ hash: 2
174
207
  none: false
175
- requirements:
176
- - - ">="
177
- - !ruby/object:Gem::Version
178
- hash: 2
179
- segments:
180
- - 0
181
- version: "0"
182
- required_rubygems_version: !ruby/object:Gem::Requirement
208
+ required_rubygems_version: !ruby/object:Gem::Requirement
209
+ requirements:
210
+ - - ! '>='
211
+ - !ruby/object:Gem::Version
212
+ version: '0'
213
+ segments:
214
+ - 0
215
+ hash: 2
183
216
  none: false
184
- requirements:
185
- - - ">="
186
- - !ruby/object:Gem::Version
187
- hash: 2
188
- segments:
189
- - 0
190
- version: "0"
191
217
  requirements: []
192
-
193
- rubyforge_project:
194
- rubygems_version: 1.8.15
195
- signing_key:
218
+ rubyforge_project:
219
+ rubygems_version: 1.8.24
220
+ signing_key:
196
221
  specification_version: 3
197
222
  summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
198
223
  test_files: []
199
-
224
+ ...