iudex-da 1.2.1-java → 1.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2012 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-da'
23
+ require 'iudex-da/models'
24
+
25
+ class TestUrlModel < MiniTest::Unit::TestCase
26
+ include Iudex::DA
27
+ include Iudex::DA::ORM
28
+
29
+ def setup
30
+ Url.truncate
31
+ end
32
+
33
+ def test_round_trip
34
+ urls = [ "http://foo.gravitext.com/bar/1",
35
+ "http://gravitext.com/2",
36
+ "http://hometown.com/33" ]
37
+
38
+ urls.each do | u, p |
39
+ Url.create( :visit_url => u, :type => "PAGE" )
40
+ end
41
+
42
+ assert_equal( 2, Url.where( :domain => 'gravitext.com' ).count )
43
+ assert_equal( 1, Url.where( :domain => 'hometown.com' ).count )
44
+
45
+ sample = Url.find_by_url( urls[ 0 ] )
46
+ assert_equal( urls[ 0 ], sample.url )
47
+ assert_equal( 'PAGE', sample.type )
48
+
49
+ refute( Url.find_by_url( "http://spunk" ) )
50
+ end
51
+
52
+ end
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2012 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-da'
23
+ require 'iudex-da/key_helper'
24
+ require 'iudex-da/pool_data_source_factory'
25
+ require 'iudex-da/models'
26
+
27
+ class TestWorkPoller < MiniTest::Unit::TestCase
28
+ include Iudex::Filter::KeyHelper
29
+ include Iudex::DA
30
+ include Iudex::DA::ORM
31
+
32
+ Gravitext::HTMap::UniMap.define_accessors
33
+
34
+ URLS = [ [ "http://foo.gravitext.com/bar/1", 11 ],
35
+ [ "http://hometown.com/33", 10 ],
36
+ [ "http://gravitext.com/2", 9 ] ]
37
+
38
+ def setup
39
+ Url.truncate
40
+
41
+ URLS.each do | u, p |
42
+ Url.create( :visit_url => u, :priority => p, :type => "PAGE" )
43
+ end
44
+
45
+ @factory = PoolDataSourceFactory.new( :loglevel => 4 )
46
+ @data_source = @factory.create
47
+ @mapper = ContentMapper.new( keys( :url, :type, :priority,
48
+ :next_visit_after ) )
49
+ @poller = WorkPoller.new( @data_source, @mapper )
50
+ end
51
+
52
+ def teardown
53
+ @factory.close
54
+ @date_source = nil
55
+ @poller = nil
56
+ end
57
+
58
+ attr_reader :poller
59
+
60
+ def test_default_poll
61
+ pos = 0
62
+ poller.poll.each do |map|
63
+ assert_equal( URLS[ pos ][ 0 ], map.url.url )
64
+ pos += 1
65
+ end
66
+ assert_equal( 3, pos )
67
+ end
68
+
69
+ def test_poll_with_max_priority_urls
70
+ poller.max_priority_urls = 4
71
+
72
+ pos = 0
73
+ poller.poll.each do |map|
74
+ assert_equal( URLS[ pos ][ 0 ], map.url.url )
75
+ pos += 1
76
+ end
77
+ assert_equal( 3, pos )
78
+ end
79
+
80
+ def test_poll_with_domain_depth
81
+ poller.domain_depth_coef = 0.125
82
+ poller.max_priority_urls = 4
83
+
84
+ pos = 0
85
+ poller.poll.each do |map|
86
+ assert_equal( URLS[ pos ][ 0 ], map.url.url )
87
+ pos += 1
88
+ end
89
+ assert_equal( 3, pos )
90
+ end
91
+
92
+ def test_poll_with_domain_depth_only
93
+ poller.domain_depth_coef = 0.125
94
+ poller.age_coef_1 = 0.0
95
+
96
+ pos = 0
97
+ poller.poll.each do |map|
98
+ assert_equal( URLS[ pos ][ 0 ], map.url.url )
99
+ pos += 1
100
+ end
101
+ assert_equal( 3, pos )
102
+ end
103
+
104
+ def test_poll_with_domain_group
105
+ poller.do_domain_group = true
106
+
107
+ urls = [ [ "http://foo.gravitext.com/bar/1", 11 ],
108
+ [ "http://gravitext.com/2", 9 ],
109
+ [ "http://hometown.com/33", 10 ] ]
110
+
111
+ pos = 0
112
+ poller.poll.each do |map|
113
+ assert_equal( urls[ pos ][ 0 ], map.url.url, "pos #{pos}" )
114
+ pos += 1
115
+ end
116
+ assert_equal( 3, pos )
117
+ end
118
+
119
+
120
+ def test_poll_domain_union_1
121
+ poller.domain_union = [ [ 'gravitext.com', 15000 ] ]
122
+
123
+ result = poller.poll
124
+ assert_equal( 2, result.size )
125
+ end
126
+
127
+ def test_poll_domain_union_2
128
+ poller.domain_union = [ [ 'gravitext.com', 15000 ],
129
+ [ nil, 10000 ] ]
130
+
131
+ result = poller.poll
132
+ assert_equal( 3, result.size )
133
+ end
134
+
135
+ def test_poll_domain_union_3
136
+ poller.domain_union = [ [ 'gravitext.com', 1 ],
137
+ [ 'hometown.com', 1 ],
138
+ [ nil, 3 ] ]
139
+
140
+ result = poller.poll
141
+ assert_equal( 2, result.size )
142
+ end
143
+
144
+ def test_poll_uhash_slice
145
+ poller.uhash_slice = [ 4, 5 ]
146
+
147
+ urls = [ [ "http://hometown.com/33", 10 ] ]
148
+
149
+ pos = 0
150
+ poller.poll.each do |map|
151
+ assert_equal( urls[ pos ][ 0 ], map.url.url, "pos #{pos}" )
152
+ pos += 1
153
+ end
154
+ assert_equal( 1, pos )
155
+ end
156
+
157
+ end
metadata CHANGED
@@ -1,199 +1,224 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: iudex-da
3
- version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 1.2.1
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.3.0
6
6
  platform: java
7
- authors:
8
- - David Kellum
9
- autorequire:
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2012-09-15 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: iudex-core
17
- version_requirements: &id001 !ruby/object:Gem::Requirement
18
- none: false
19
- requirements:
20
- - - ~>
21
- - !ruby/object:Gem::Version
22
- version: 1.2.1
23
- requirement: *id001
24
- prerelease: false
25
- type: :runtime
26
- - !ruby/object:Gem::Dependency
27
- name: activerecord
28
- version_requirements: &id002 !ruby/object:Gem::Requirement
29
- none: false
30
- requirements:
31
- - - ~>
32
- - !ruby/object:Gem::Version
33
- version: 3.1.3
34
- requirement: *id002
35
- prerelease: false
36
- type: :runtime
37
- - !ruby/object:Gem::Dependency
38
- name: jdbc-postgres
39
- version_requirements: &id003 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
42
- - - ~>
43
- - !ruby/object:Gem::Version
44
- version: 9.1.901
45
- requirement: *id003
46
- prerelease: false
47
- type: :runtime
48
- - !ruby/object:Gem::Dependency
49
- name: activerecord-jdbcpostgresql-adapter
50
- version_requirements: &id004 !ruby/object:Gem::Requirement
51
- none: false
52
- requirements:
53
- - - ~>
54
- - !ruby/object:Gem::Version
55
- version: 1.2.2
56
- requirement: *id004
57
- prerelease: false
58
- type: :runtime
59
- - !ruby/object:Gem::Dependency
60
- name: rjack-commons-dbcp
61
- version_requirements: &id005 !ruby/object:Gem::Requirement
62
- none: false
63
- requirements:
64
- - - ~>
65
- - !ruby/object:Gem::Version
66
- version: 1.4.0
67
- requirement: *id005
68
- prerelease: false
69
- type: :runtime
70
- - !ruby/object:Gem::Dependency
71
- name: rjack-commons-dbutils
72
- version_requirements: &id006 !ruby/object:Gem::Requirement
73
- none: false
74
- requirements:
75
- - - ~>
76
- - !ruby/object:Gem::Version
77
- version: 1.4.0
78
- requirement: *id006
79
- prerelease: false
80
- type: :runtime
81
- - !ruby/object:Gem::Dependency
82
- name: minitest
83
- version_requirements: &id007 !ruby/object:Gem::Requirement
84
- none: false
85
- requirements:
86
- - - ~>
87
- - !ruby/object:Gem::Version
88
- version: "2.3"
89
- requirement: *id007
90
- prerelease: false
91
- type: :development
92
- - !ruby/object:Gem::Dependency
93
- name: rjack-logback
94
- version_requirements: &id008 !ruby/object:Gem::Requirement
95
- none: false
96
- requirements:
97
- - - ~>
98
- - !ruby/object:Gem::Version
99
- version: "1.2"
100
- requirement: *id008
101
- prerelease: false
102
- type: :development
103
- - !ruby/object:Gem::Dependency
104
- name: rjack-tarpit
105
- version_requirements: &id009 !ruby/object:Gem::Requirement
106
- none: false
107
- requirements:
108
- - - ~>
109
- - !ruby/object:Gem::Version
110
- version: "2.0"
111
- requirement: *id009
112
- prerelease: false
113
- type: :development
12
+ date: 2012-10-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: iudex-core
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ! '>='
19
+ - !ruby/object:Gem::Version
20
+ version: 1.2.1
21
+ - - <
22
+ - !ruby/object:Gem::Version
23
+ version: '1.4'
24
+ none: false
25
+ requirement: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.2.1
30
+ - - <
31
+ - !ruby/object:Gem::Version
32
+ version: '1.4'
33
+ none: false
34
+ prerelease: false
35
+ type: :runtime
36
+ - !ruby/object:Gem::Dependency
37
+ name: sequel
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ~>
41
+ - !ruby/object:Gem::Version
42
+ version: 3.40.0
43
+ none: false
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ~>
47
+ - !ruby/object:Gem::Version
48
+ version: 3.40.0
49
+ none: false
50
+ prerelease: false
51
+ type: :runtime
52
+ - !ruby/object:Gem::Dependency
53
+ name: jdbc-postgres
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ~>
57
+ - !ruby/object:Gem::Version
58
+ version: 9.1.901
59
+ none: false
60
+ requirement: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ~>
63
+ - !ruby/object:Gem::Version
64
+ version: 9.1.901
65
+ none: false
66
+ prerelease: false
67
+ type: :runtime
68
+ - !ruby/object:Gem::Dependency
69
+ name: rjack-commons-dbcp
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ~>
73
+ - !ruby/object:Gem::Version
74
+ version: 1.4.0
75
+ none: false
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ~>
79
+ - !ruby/object:Gem::Version
80
+ version: 1.4.0
81
+ none: false
82
+ prerelease: false
83
+ type: :runtime
84
+ - !ruby/object:Gem::Dependency
85
+ name: rjack-commons-dbutils
86
+ version_requirements: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ~>
89
+ - !ruby/object:Gem::Version
90
+ version: 1.4.0
91
+ none: false
92
+ requirement: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 1.4.0
97
+ none: false
98
+ prerelease: false
99
+ type: :runtime
100
+ - !ruby/object:Gem::Dependency
101
+ name: minitest
102
+ version_requirements: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ~>
105
+ - !ruby/object:Gem::Version
106
+ version: '2.3'
107
+ none: false
108
+ requirement: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ~>
111
+ - !ruby/object:Gem::Version
112
+ version: '2.3'
113
+ none: false
114
+ prerelease: false
115
+ type: :development
116
+ - !ruby/object:Gem::Dependency
117
+ name: rjack-logback
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ~>
121
+ - !ruby/object:Gem::Version
122
+ version: '1.2'
123
+ none: false
124
+ requirement: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ~>
127
+ - !ruby/object:Gem::Version
128
+ version: '1.2'
129
+ none: false
130
+ prerelease: false
131
+ type: :development
132
+ - !ruby/object:Gem::Dependency
133
+ name: rjack-tarpit
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: '2.0'
139
+ none: false
140
+ requirement: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ~>
143
+ - !ruby/object:Gem::Version
144
+ version: '2.0'
145
+ none: false
146
+ prerelease: false
147
+ type: :development
114
148
  description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-da gem provides a PostgreSQL-based content meta-data store and work priority queue.
115
- email:
116
- - dek-oss@gravitext.com
117
- executables:
118
- - iudex-da-generate-test-data
119
- - iudex-da-import
120
- - iudex-da-simhash-dump
121
- - iudex-migrate
149
+ email:
150
+ - dek-oss@gravitext.com
151
+ executables:
152
+ - iudex-da-generate-test-data
153
+ - iudex-da-import
154
+ - iudex-da-simhash-dump
155
+ - iudex-migrate
122
156
  extensions: []
123
-
124
- extra_rdoc_files:
125
- - History.rdoc
126
- - README.rdoc
127
- files:
128
- - History.rdoc
129
- - Manifest.txt
130
- - README.rdoc
131
- - Rakefile
132
- - pom.xml
133
- - bin/iudex-da-generate-test-data
134
- - bin/iudex-da-import
135
- - bin/iudex-da-simhash-dump
136
- - bin/iudex-migrate
137
- - config/config.rb
138
- - db/0010_base_urls.rb
139
- - db/0020_add_feed_metadata.rb
140
- - db/0021_more_feed_text.rb
141
- - db/0030_add_priority.rb
142
- - db/0040_add_visit_after.rb
143
- - db/0050_add_cache_location.rb
144
- - db/0060_url_indexes.rb
145
- - db/0070_add_created_at.rb
146
- - db/0080_add_simhash.rb
147
- - db/0081_remove_simhash_index.rb
148
- - db/0110_host_to_domain.rb
149
- - db/index_next_visit/0100_add_index_next_visit.rb
150
- - db/simhash/0085_add_simhash_index.rb
151
- - lib/iudex-da/base.rb
152
- - lib/iudex-da.rb
153
- - lib/iudex-da/ar.rb
154
- - lib/iudex-da/config.rb
155
- - lib/iudex-da/factory_helper.rb
156
- - lib/iudex-da/importer.rb
157
- - lib/iudex-da/key_helper.rb
158
- - lib/iudex-da/pool_data_source_factory.rb
159
- - test/setup.rb
160
- - test/test_migrate.rb
161
- - test/test_poll_work.rb
162
- - test/test_pool_factory.rb
163
- - lib/iudex-da/iudex-da-1.2.1.jar
157
+ extra_rdoc_files:
158
+ - History.rdoc
159
+ - README.rdoc
160
+ files:
161
+ - History.rdoc
162
+ - Manifest.txt
163
+ - README.rdoc
164
+ - Rakefile
165
+ - pom.xml
166
+ - bin/iudex-da-generate-test-data
167
+ - bin/iudex-da-import
168
+ - bin/iudex-da-simhash-dump
169
+ - bin/iudex-migrate
170
+ - config/config.rb
171
+ - db/20111012173757_base.rb
172
+ - db/20120930173600_uhash_collation_order.rb
173
+ - db/index_next_visit/21500000000101_add_index_next_visit.rb
174
+ - db/simhash/21500000000001_add_simhash_index.rb
175
+ - lib/iudex-da/base.rb
176
+ - lib/iudex-da.rb
177
+ - lib/iudex-da/config.rb
178
+ - lib/iudex-da/factory_helper.rb
179
+ - lib/iudex-da/importer.rb
180
+ - lib/iudex-da/key_helper.rb
181
+ - lib/iudex-da/models.rb
182
+ - lib/iudex-da/orm.rb
183
+ - lib/iudex-da/pool_data_source_factory.rb
184
+ - lib/iudex-da/work_poller.rb
185
+ - test/setup.rb
186
+ - test/test_migrate.rb
187
+ - test/test_pool_factory.rb
188
+ - test/test_url_model.rb
189
+ - test/test_work_poller.rb
190
+ - lib/iudex-da/iudex-da-1.3.0.jar
164
191
  homepage: http://iudex.gravitext.com
165
192
  licenses: []
166
-
167
- post_install_message:
168
- rdoc_options:
169
- - --main
170
- - README.rdoc
171
- require_paths:
172
- - lib
173
- required_ruby_version: !ruby/object:Gem::Requirement
193
+ post_install_message:
194
+ rdoc_options:
195
+ - --main
196
+ - README.rdoc
197
+ require_paths:
198
+ - lib
199
+ required_ruby_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - ! '>='
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ segments:
205
+ - 0
206
+ hash: 2
174
207
  none: false
175
- requirements:
176
- - - ">="
177
- - !ruby/object:Gem::Version
178
- hash: 2
179
- segments:
180
- - 0
181
- version: "0"
182
- required_rubygems_version: !ruby/object:Gem::Requirement
208
+ required_rubygems_version: !ruby/object:Gem::Requirement
209
+ requirements:
210
+ - - ! '>='
211
+ - !ruby/object:Gem::Version
212
+ version: '0'
213
+ segments:
214
+ - 0
215
+ hash: 2
183
216
  none: false
184
- requirements:
185
- - - ">="
186
- - !ruby/object:Gem::Version
187
- hash: 2
188
- segments:
189
- - 0
190
- version: "0"
191
217
  requirements: []
192
-
193
- rubyforge_project:
194
- rubygems_version: 1.8.15
195
- signing_key:
218
+ rubyforge_project:
219
+ rubygems_version: 1.8.24
220
+ signing_key:
196
221
  specification_version: 3
197
222
  summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
198
223
  test_files: []
199
-
224
+ ...