iudex-da 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +32 -0
- data/README.rdoc +30 -0
- data/Rakefile +49 -0
- data/bin/iudex-da-generate-test-data +127 -0
- data/bin/iudex-da-import +62 -0
- data/bin/iudex-da-simhash-dump +112 -0
- data/bin/iudex-migrate +66 -0
- data/config/config.rb +14 -0
- data/db/0010_base_urls.rb +84 -0
- data/db/0020_add_feed_metadata.rb +37 -0
- data/db/0021_more_feed_text.rb +29 -0
- data/db/0030_add_priority.rb +28 -0
- data/db/0040_add_visit_after.rb +30 -0
- data/db/0050_add_cache_location.rb +32 -0
- data/db/0060_url_indexes.rb +41 -0
- data/db/0070_add_created_at.rb +28 -0
- data/db/0080_add_simhash.rb +33 -0
- data/lib/iudex-da.rb +40 -0
- data/lib/iudex-da/ar.rb +48 -0
- data/lib/iudex-da/base.rb +23 -0
- data/lib/iudex-da/config.rb +31 -0
- data/lib/iudex-da/factory_helper.rb +53 -0
- data/lib/iudex-da/importer.rb +91 -0
- data/lib/iudex-da/iudex-da-1.0.0.jar +0 -0
- data/lib/iudex-da/key_helper.rb +33 -0
- data/lib/iudex-da/pool_data_source_factory.rb +108 -0
- data/pom.xml +86 -0
- data/test/setup.rb +34 -0
- data/test/test_migrate.rb +41 -0
- data/test/test_poll_work.rb +132 -0
- data/test/test_pool_factory.rb +59 -0
- metadata +203 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You may
|
9
|
+
# obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-da'
|
23
|
+
require 'iudex-da/ar'
|
24
|
+
|
25
|
+
# Exercises the schema migrations in both directions: migrate to the
# latest version, back down to version 0, and up to the latest again.
class TestMigrate < MiniTest::Unit::TestCase
  include Iudex::DA
  include RJack

  # Runs migrate / migrate( 0 ) / migrate with ActiveRecord migration
  # messages suppressed. The test passes when none of the three steps
  # raises; it makes no further assertions.
  def test_up_down_up
    # Quiet the 'iudex.da.ActiveRecord' logger while the migrations run.
    Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN

    ActiveRecord::Migration.suppress_messages do
      migrate
      migrate( 0 )
      migrate
    end
  ensure
    # Always undo the level override, even when a migration step raises,
    # so a failure here does not change logging for subsequent tests.
    Logback[ 'iudex.da.ActiveRecord' ].level = nil
  end

end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-core'
|
23
|
+
require 'iudex-da/ar'
|
24
|
+
|
25
|
+
# Tests SQL used for polling work from, and bulk inserting into, the
# urls table via the ActiveRecord Url model.
class TestPollWork < MiniTest::Unit::TestCase
  include Iudex::DA
  import 'iudex.core.VisitURL'

  # Maximum rows per host requested from (and asserted on) the poll query.
  MAX_PER_HOST = 5

  # Maximum total rows requested from (and asserted on) the poll query.
  MAX_TOTAL = 18

  # Empties the urls table, then creates 11 "FEED" urls (val 5..15) for
  # each of four hosts. Each url's priority is derived from val plus a
  # running count across all hosts, and next_visit_after is set to now.
  def setup
    Url.delete_all

    hosts = [ 'foo.org', 'other.net', 'gravitext.com', 'one.at' ]
    count = 0
    hosts.each do |host|
      (5..15).each do |val|
        Url.create! do |u|
          u.priority = ( val.to_f / 10.0 ) + ( count.to_f / 50.0 )
          # The priority is embedded in the path, giving each row its own url.
          vurl = VisitURL.normalize( "http://#{host}/#{u.priority}" )
          u.type = "FEED"
          u.host = vurl.host
          u.url = vurl.to_s
          u.uhash = vurl.uhash
          u.next_visit_after = Time.now
          count += 1
        end
      end
    end
  end

  # Removes every row created by setup or by a test.
  def teardown
    Url.delete_all
  end

  # Asserts that one host's run of result rows respects the per-host
  # limit and is ordered by non-increasing priority.
  #
  # byhost:: Array of consecutive result rows sharing the same host.
  def check_host_subset( byhost )
    assert( byhost.length <= MAX_PER_HOST )
    byhost.each_cons(2) { |p,n| assert( p.priority >= n.priority ) }
  end

  # Query to get new work, with limits on work per host, and total
  # work (in descending priority order)
  def test_poll
    query = <<END
 SELECT url, host, type, priority
 FROM ( SELECT *, row_number() OVER ( ORDER BY priority DESC ) as ppos
        FROM ( SELECT *, row_number() OVER ( PARTITION BY host
                                             ORDER BY priority DESC ) AS hpos
               FROM urls
               WHERE next_visit_after <= now() ) AS subh
        WHERE hpos <= ? ) AS subp
 WHERE ppos <= ?
 ORDER BY host, priority DESC;
END
    res = Url.find_by_sql( [ query, MAX_PER_HOST, MAX_TOTAL ] )

    assert( res.length <= MAX_TOTAL )

    # Rows arrive ordered by host, so each host forms one contiguous run.
    byhost = []
    res.each do |u|
      if byhost.empty? || byhost.last.host == u.host
        byhost << u
      else
        check_host_subset( byhost )
        # Start the next run WITH the current row. Resetting to an empty
        # array here would silently drop the first row of every host
        # after the first, leaving it out of both checks.
        byhost = [ u ]
      end
    end
    check_host_subset( byhost ) unless byhost.empty?

  end

  # Stages urls in a temporary table, then inserts into urls only those
  # whose uhash is not already present. All statements run inside one
  # Url.transaction.
  #
  # NOTE(review): this test makes no assertions; it only verifies that
  # the statements execute without raising.
  def test_insert

    Url.transaction do
      sql = <<END
 CREATE TEMPORARY TABLE mod_urls
 ( uhash text,
   url text,
   host text );
END
      # ON COMMIT DROP;

      Url.connection.execute( sql ) #FIXME: auto-commit mode?

      # Url.set_table_name "mod_urls"

      # setup's running count reaches 11 * 2 when it starts on its third
      # host, gravitext.com, so val 5..15 below repeat the priorities
      # setup used for that host; val 16..20 are additional. Presumably
      # the repeats yield uhashes already in urls -- TODO confirm.
      count = ( 11 * 2 )
      (5..20).each do |val|
        # url = Url.create! do |u|
        priority = ( val.to_f / 10.0 ) + ( count.to_f / 50.0 )
        # u.priority =
        # u.type = "FEEDX"
        vurl = VisitURL.normalize( "http://gravitext.com/#{priority}" )

        sql = "INSERT into mod_urls VALUES ('%s','%s','%s')" %
          [ vurl.uhash, vurl.to_s, vurl.host ]
        Url.connection.execute( sql )
        # u.next_visit_after = Time.now
        count += 1
      end

      # Copy across only the staged rows whose uhash is not yet in urls.
      insert_query = <<END
 INSERT INTO urls (uhash,url,host,type,priority)
 ( SELECT uhash,url,host,'FEEDX',4.78 FROM mod_urls
   WHERE uhash NOT IN ( SELECT uhash FROM urls ) );
END
      Url.connection.execute( insert_query )

      Url.connection.execute( "DROP TABLE mod_urls;" )

      # Url.set_table_name "urls"
    end

  end

end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
20
|
+
|
21
|
+
require 'iudex-core'
|
22
|
+
require 'iudex-da/ar'
|
23
|
+
|
24
|
+
require 'iudex-da'
|
25
|
+
require 'iudex-da/pool_data_source_factory'
|
26
|
+
|
27
|
+
# Tests that a data source created by PoolDataSourceFactory can run a
# query through commons-dbutils QueryRunner.
class TestPoolFactory < MiniTest::Unit::TestCase
  include Iudex::DA
  import 'org.apache.commons.dbutils.ResultSetHandler'
  import 'org.apache.commons.dbutils.QueryRunner'

  # Builds a factory (with :loglevel => 2) and creates the data source
  # used by the tests.
  def setup
    @factory = PoolDataSourceFactory.new( :loglevel => 2 )
    @data_source = @factory.create
  end

  # Closes the factory and drops the data source reference.
  def teardown
    # @factory is still nil when setup raised before assigning it; guard
    # so teardown does not raise NoMethodError on top of the real failure.
    @factory.close if @factory
    @data_source = nil
  end

  # ResultSetHandler that prints the 'url' column of each row and
  # returns nil.
  class TestHandler
    include ResultSetHandler

    # rs:: the result set passed in by QueryRunner#query.
    def handle( rs )
      while rs.next
        p [ rs.string( 'url' ) ]
      end
      nil
    end
  end

  # Asserts a data source was created, then runs a SELECT on urls
  # through it. The query result itself is not asserted on; the test
  # passes when the query executes without raising.
  def test_query
    assert( ! @data_source.nil? )
    qrun = QueryRunner.new( @data_source )
    qrun.query( "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );",
                TestHandler.new )
  end

end
|
metadata
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-da
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-core
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: activerecord
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ~>
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 2.3.10
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: jdbc-postgres
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 8.4.702
|
47
|
+
- - <
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "9.1"
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id003
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: activerecord-jdbcpostgresql-adapter
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.1.0
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id004
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: rjack-commons-dbcp
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ~>
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: 1.4.0
|
72
|
+
type: :runtime
|
73
|
+
version_requirements: *id005
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: rjack-commons-dbutils
|
76
|
+
prerelease: false
|
77
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.3.0
|
83
|
+
type: :runtime
|
84
|
+
version_requirements: *id006
|
85
|
+
- !ruby/object:Gem::Dependency
|
86
|
+
name: minitest
|
87
|
+
prerelease: false
|
88
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.7.1
|
94
|
+
- - <
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "2.1"
|
97
|
+
type: :development
|
98
|
+
version_requirements: *id007
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: rjack-logback
|
101
|
+
prerelease: false
|
102
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ~>
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: "1.0"
|
108
|
+
type: :development
|
109
|
+
version_requirements: *id008
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rjack-tarpit
|
112
|
+
prerelease: false
|
113
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ~>
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: 1.3.0
|
119
|
+
type: :development
|
120
|
+
version_requirements: *id009
|
121
|
+
description: |-
|
122
|
+
Iudex is a general purpose web crawler and feed processor in
|
123
|
+
ruby/java. The iudex-da gem provides a PostgreSQL-based content
|
124
|
+
meta-data store and work priority queue.
|
125
|
+
email:
|
126
|
+
- dek-oss@gravitext.com
|
127
|
+
executables:
|
128
|
+
- iudex-da-generate-test-data
|
129
|
+
- iudex-da-import
|
130
|
+
- iudex-da-simhash-dump
|
131
|
+
- iudex-migrate
|
132
|
+
extensions: []
|
133
|
+
|
134
|
+
extra_rdoc_files:
|
135
|
+
- Manifest.txt
|
136
|
+
- History.rdoc
|
137
|
+
- README.rdoc
|
138
|
+
files:
|
139
|
+
- History.rdoc
|
140
|
+
- Manifest.txt
|
141
|
+
- README.rdoc
|
142
|
+
- Rakefile
|
143
|
+
- pom.xml
|
144
|
+
- bin/iudex-da-generate-test-data
|
145
|
+
- bin/iudex-da-import
|
146
|
+
- bin/iudex-da-simhash-dump
|
147
|
+
- bin/iudex-migrate
|
148
|
+
- config/config.rb
|
149
|
+
- db/0010_base_urls.rb
|
150
|
+
- db/0020_add_feed_metadata.rb
|
151
|
+
- db/0021_more_feed_text.rb
|
152
|
+
- db/0030_add_priority.rb
|
153
|
+
- db/0040_add_visit_after.rb
|
154
|
+
- db/0050_add_cache_location.rb
|
155
|
+
- db/0060_url_indexes.rb
|
156
|
+
- db/0070_add_created_at.rb
|
157
|
+
- db/0080_add_simhash.rb
|
158
|
+
- lib/iudex-da/base.rb
|
159
|
+
- lib/iudex-da.rb
|
160
|
+
- lib/iudex-da/ar.rb
|
161
|
+
- lib/iudex-da/config.rb
|
162
|
+
- lib/iudex-da/factory_helper.rb
|
163
|
+
- lib/iudex-da/importer.rb
|
164
|
+
- lib/iudex-da/key_helper.rb
|
165
|
+
- lib/iudex-da/pool_data_source_factory.rb
|
166
|
+
- test/setup.rb
|
167
|
+
- test/test_migrate.rb
|
168
|
+
- test/test_poll_work.rb
|
169
|
+
- test/test_pool_factory.rb
|
170
|
+
- lib/iudex-da/iudex-da-1.0.0.jar
|
171
|
+
has_rdoc: true
|
172
|
+
homepage: http://github.com/dekellum/iudex
|
173
|
+
licenses: []
|
174
|
+
|
175
|
+
post_install_message:
|
176
|
+
rdoc_options:
|
177
|
+
- --main
|
178
|
+
- README.rdoc
|
179
|
+
require_paths:
|
180
|
+
- lib
|
181
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
182
|
+
none: false
|
183
|
+
requirements:
|
184
|
+
- - ">="
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: "0"
|
187
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
|
+
none: false
|
189
|
+
requirements:
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: "0"
|
193
|
+
requirements: []
|
194
|
+
|
195
|
+
rubyforge_project: iudex-da
|
196
|
+
rubygems_version: 1.5.1
|
197
|
+
signing_key:
|
198
|
+
specification_version: 3
|
199
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
200
|
+
test_files:
|
201
|
+
- test/test_migrate.rb
|
202
|
+
- test/test_poll_work.rb
|
203
|
+
- test/test_pool_factory.rb
|