iudex-da 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You may
9
+ # obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-da'
23
+ require 'iudex-da/ar'
24
+
25
+ class TestMigrate < MiniTest::Unit::TestCase
26
+ include Iudex::DA
27
+ include RJack
28
+
29
+ def test_up_down_up
30
+ Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN
31
+
32
+ ActiveRecord::Migration.suppress_messages do
33
+ migrate
34
+ migrate( 0 )
35
+ migrate
36
+ end
37
+
38
+ Logback[ 'iudex.da.ActiveRecord' ].level = nil
39
+ end
40
+
41
+ end
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ require 'iudex-core'
23
+ require 'iudex-da/ar'
24
+
25
+ class TestPollWork < MiniTest::Unit::TestCase
26
+ include Iudex::DA
27
+ import 'iudex.core.VisitURL'
28
+
29
+ def setup
30
+ Url.delete_all
31
+
32
+ hosts = [ 'foo.org', 'other.net', 'gravitext.com', 'one.at' ]
33
+ count = 0
34
+ hosts.each do |host|
35
+ (5..15).each do |val|
36
+ url = Url.create! do |u|
37
+ u.priority = ( val.to_f / 10.0 ) + (count.to_f / 50.0)
38
+ vurl = VisitURL.normalize( "http://#{host}/#{u.priority}" )
39
+ u.type = "FEED"
40
+ u.host = vurl.host
41
+ u.url = vurl.to_s
42
+ u.uhash = vurl.uhash
43
+ u.next_visit_after = Time.now
44
+ count += 1
45
+ end
46
+ end
47
+ end
48
+ end
49
+
50
+ def teardown
51
+ Url.delete_all # leave the table behind Url empty once each test finishes
52
+ end
53
+
54
+ # Query to get new work, with limits on work per host, and total
55
+ # work (in descending piority order)
56
+ def test_poll
57
+ query = <<END
58
+ SELECT url, host, type, priority
59
+ FROM ( SELECT *, row_number() OVER ( ORDER BY priority DESC ) as ppos
60
+ FROM ( SELECT *, row_number() OVER ( PARTITION BY host
61
+ ORDER BY priority DESC ) AS hpos
62
+ FROM urls
63
+ WHERE next_visit_after <= now() ) AS subh
64
+ WHERE hpos <= ? ) AS subp
65
+ WHERE ppos <= ?
66
+ ORDER BY host, priority DESC;
67
+ END
68
+ res = Url.find_by_sql( [ query, 5, 18 ] )
69
+
70
+ def check_host_subset( byhost )
71
+ assert( byhost.length <= 5 )
72
+ byhost.each_cons(2) { |p,n| assert( p.priority >= n.priority ) }
73
+ end
74
+
75
+ assert( res.length <= 18 )
76
+ byhost = []
77
+ res.each do |u|
78
+ if byhost.empty? || byhost.last.host == u.host
79
+ byhost << u
80
+ else
81
+ check_host_subset( byhost )
82
+ byhost = []
83
+ end
84
+ end
85
+ check_host_subset( byhost ) unless byhost.empty?
86
+
87
+ end
88
+
89
+ def test_insert
90
+
91
+ Url.transaction do
92
+ sql = <<END
93
+ CREATE TEMPORARY TABLE mod_urls
94
+ ( uhash text,
95
+ url text,
96
+ host text );
97
+ END
98
+ # ON COMMIT DROP;
99
+
100
+ Url.connection.execute( sql ) #FIXME: auto-commit mode?
101
+
102
+ # Url.set_table_name "mod_urls"
103
+
104
+ count = ( 11 * 2 )
105
+ (5..20).each do |val|
106
+ # url = Url.create! do |u|
107
+ priority = ( val.to_f / 10.0 ) + (count.to_f / 50.0)
108
+ # u.priority =
109
+ # u.type = "FEEDX"
110
+ vurl = VisitURL.normalize( "http://gravitext.com/#{priority}" )
111
+
112
+ sql = "INSERT into mod_urls VALUES ('%s','%s','%s')" %
113
+ [ vurl.uhash, vurl.to_s, vurl.host ]
114
+ Url.connection.execute( sql )
115
+ # u.next_visit_after = Time.now
116
+ count += 1
117
+ end
118
+ insert_query = <<END
119
+ INSERT INTO urls (uhash,url,host,type,priority)
120
+ ( SELECT uhash,url,host,'FEEDX',4.78 FROM mod_urls
121
+ WHERE uhash NOT IN ( SELECT uhash FROM urls ) );
122
+ END
123
+ Url.connection.execute( insert_query )
124
+
125
+ Url.connection.execute( "DROP TABLE mod_urls;" )
126
+
127
+ # Url.set_table_name "urls"
128
+ end
129
+
130
+ end
131
+
132
+ end
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2008-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+ require File.join( File.dirname( __FILE__ ), "setup" )
20
+
21
+ require 'iudex-core'
22
+ require 'iudex-da/ar'
23
+
24
+ require 'iudex-da'
25
+ require 'iudex-da/pool_data_source_factory'
26
+
27
+ class TestPoolFactory < MiniTest::Unit::TestCase
28
+ include Iudex::DA
29
+ import 'org.apache.commons.dbutils.ResultSetHandler'
30
+ import 'org.apache.commons.dbutils.QueryRunner'
31
+
32
+ def setup
33
+ @factory = PoolDataSourceFactory.new( :loglevel => 2 )
34
+ @data_source = @factory.create
35
+ end
36
+
37
+ def teardown
38
+ @factory.close
39
+ @data_source = nil
40
+ end
41
+
42
+ class TestHandler
43
+ include ResultSetHandler
44
+ def handle( rs )
45
+ while rs.next
46
+ p [ rs.string( 'url' ) ]
47
+ end
48
+ nil
49
+ end
50
+ end
51
+
52
+ def test_query
53
+ assert( ! @data_source.nil? )
54
+ qrun = QueryRunner.new( @data_source )
55
+ qrun.query( "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );",
56
+ TestHandler.new )
57
+ end
58
+
59
+ end
metadata ADDED
@@ -0,0 +1,203 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-da
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-core
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 2.3.10
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: jdbc-postgres
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 8.4.702
47
+ - - <
48
+ - !ruby/object:Gem::Version
49
+ version: "9.1"
50
+ type: :runtime
51
+ version_requirements: *id003
52
+ - !ruby/object:Gem::Dependency
53
+ name: activerecord-jdbcpostgresql-adapter
54
+ prerelease: false
55
+ requirement: &id004 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ version: 1.1.0
61
+ type: :runtime
62
+ version_requirements: *id004
63
+ - !ruby/object:Gem::Dependency
64
+ name: rjack-commons-dbcp
65
+ prerelease: false
66
+ requirement: &id005 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ~>
70
+ - !ruby/object:Gem::Version
71
+ version: 1.4.0
72
+ type: :runtime
73
+ version_requirements: *id005
74
+ - !ruby/object:Gem::Dependency
75
+ name: rjack-commons-dbutils
76
+ prerelease: false
77
+ requirement: &id006 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: 1.3.0
83
+ type: :runtime
84
+ version_requirements: *id006
85
+ - !ruby/object:Gem::Dependency
86
+ name: minitest
87
+ prerelease: false
88
+ requirement: &id007 !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: 1.7.1
94
+ - - <
95
+ - !ruby/object:Gem::Version
96
+ version: "2.1"
97
+ type: :development
98
+ version_requirements: *id007
99
+ - !ruby/object:Gem::Dependency
100
+ name: rjack-logback
101
+ prerelease: false
102
+ requirement: &id008 !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ~>
106
+ - !ruby/object:Gem::Version
107
+ version: "1.0"
108
+ type: :development
109
+ version_requirements: *id008
110
+ - !ruby/object:Gem::Dependency
111
+ name: rjack-tarpit
112
+ prerelease: false
113
+ requirement: &id009 !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ~>
117
+ - !ruby/object:Gem::Version
118
+ version: 1.3.0
119
+ type: :development
120
+ version_requirements: *id009
121
+ description: |-
122
+ Iudex is a general purpose web crawler and feed processor in
123
+ ruby/java. The iudex-da gem provides a PostgreSQL-based content
124
+ meta-data store and work priority queue.
125
+ email:
126
+ - dek-oss@gravitext.com
127
+ executables:
128
+ - iudex-da-generate-test-data
129
+ - iudex-da-import
130
+ - iudex-da-simhash-dump
131
+ - iudex-migrate
132
+ extensions: []
133
+
134
+ extra_rdoc_files:
135
+ - Manifest.txt
136
+ - History.rdoc
137
+ - README.rdoc
138
+ files:
139
+ - History.rdoc
140
+ - Manifest.txt
141
+ - README.rdoc
142
+ - Rakefile
143
+ - pom.xml
144
+ - bin/iudex-da-generate-test-data
145
+ - bin/iudex-da-import
146
+ - bin/iudex-da-simhash-dump
147
+ - bin/iudex-migrate
148
+ - config/config.rb
149
+ - db/0010_base_urls.rb
150
+ - db/0020_add_feed_metadata.rb
151
+ - db/0021_more_feed_text.rb
152
+ - db/0030_add_priority.rb
153
+ - db/0040_add_visit_after.rb
154
+ - db/0050_add_cache_location.rb
155
+ - db/0060_url_indexes.rb
156
+ - db/0070_add_created_at.rb
157
+ - db/0080_add_simhash.rb
158
+ - lib/iudex-da/base.rb
159
+ - lib/iudex-da.rb
160
+ - lib/iudex-da/ar.rb
161
+ - lib/iudex-da/config.rb
162
+ - lib/iudex-da/factory_helper.rb
163
+ - lib/iudex-da/importer.rb
164
+ - lib/iudex-da/key_helper.rb
165
+ - lib/iudex-da/pool_data_source_factory.rb
166
+ - test/setup.rb
167
+ - test/test_migrate.rb
168
+ - test/test_poll_work.rb
169
+ - test/test_pool_factory.rb
170
+ - lib/iudex-da/iudex-da-1.0.0.jar
171
+ has_rdoc: true
172
+ homepage: http://github.com/dekellum/iudex
173
+ licenses: []
174
+
175
+ post_install_message:
176
+ rdoc_options:
177
+ - --main
178
+ - README.rdoc
179
+ require_paths:
180
+ - lib
181
+ required_ruby_version: !ruby/object:Gem::Requirement
182
+ none: false
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: "0"
187
+ required_rubygems_version: !ruby/object:Gem::Requirement
188
+ none: false
189
+ requirements:
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: "0"
193
+ requirements: []
194
+
195
+ rubyforge_project: iudex-da
196
+ rubygems_version: 1.5.1
197
+ signing_key:
198
+ specification_version: 3
199
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
200
+ test_files:
201
+ - test/test_migrate.rb
202
+ - test/test_poll_work.rb
203
+ - test/test_pool_factory.rb