iudex-da 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +32 -0
- data/README.rdoc +30 -0
- data/Rakefile +49 -0
- data/bin/iudex-da-generate-test-data +127 -0
- data/bin/iudex-da-import +62 -0
- data/bin/iudex-da-simhash-dump +112 -0
- data/bin/iudex-migrate +66 -0
- data/config/config.rb +14 -0
- data/db/0010_base_urls.rb +84 -0
- data/db/0020_add_feed_metadata.rb +37 -0
- data/db/0021_more_feed_text.rb +29 -0
- data/db/0030_add_priority.rb +28 -0
- data/db/0040_add_visit_after.rb +30 -0
- data/db/0050_add_cache_location.rb +32 -0
- data/db/0060_url_indexes.rb +41 -0
- data/db/0070_add_created_at.rb +28 -0
- data/db/0080_add_simhash.rb +33 -0
- data/lib/iudex-da.rb +40 -0
- data/lib/iudex-da/ar.rb +48 -0
- data/lib/iudex-da/base.rb +23 -0
- data/lib/iudex-da/config.rb +31 -0
- data/lib/iudex-da/factory_helper.rb +53 -0
- data/lib/iudex-da/importer.rb +91 -0
- data/lib/iudex-da/iudex-da-1.0.0.jar +0 -0
- data/lib/iudex-da/key_helper.rb +33 -0
- data/lib/iudex-da/pool_data_source_factory.rb +108 -0
- data/pom.xml +86 -0
- data/test/setup.rb +34 -0
- data/test/test_migrate.rb +41 -0
- data/test/test_poll_work.rb +132 -0
- data/test/test_pool_factory.rb +59 -0
- metadata +203 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You may
|
9
|
+
# obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-da'
|
23
|
+
require 'iudex-da/ar'
|
24
|
+
|
25
|
+
# Runs the schema migrations in both directions, using the migrate
# helper mixed in from Iudex::DA.
class TestMigrate < MiniTest::Unit::TestCase
  include Iudex::DA
  include RJack

  # Migrate, then migrate( 0 ), then migrate again (up, down, up as the
  # test name puts it; presumably 0 is the target schema version).
  # Migration messages are suppressed and the ActiveRecord logger is
  # raised to WARN while this runs.
  def test_up_down_up
    Logback[ 'iudex.da.ActiveRecord' ].level = Logback::WARN

    begin
      ActiveRecord::Migration.suppress_messages do
        migrate
        migrate( 0 )
        migrate
      end
    ensure
      # Undo the level override even when a migration raises, so a
      # failure here does not leave logging altered for later tests.
      Logback[ 'iudex.da.ActiveRecord' ].level = nil
    end
  end

end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
require 'iudex-core'
|
23
|
+
require 'iudex-da/ar'
|
24
|
+
|
25
|
+
# Exercises raw SQL for polling prioritized work and for bulk
# inserting new URLs, run against the urls table through the Url model.
class TestPollWork < MiniTest::Unit::TestCase
  include Iudex::DA
  import 'iudex.core.VisitURL'

  # Maximum rows per host requested by the poll query
  HOST_LIMIT = 5

  # Maximum total rows requested by the poll query
  TOTAL_LIMIT = 18

  # Empty the urls table, then create one FEED row per host for each
  # value in 5..15. Each priority is derived from the value plus a
  # running row count, and is also used as the URL path.
  def setup
    Url.delete_all

    hosts = [ 'foo.org', 'other.net', 'gravitext.com', 'one.at' ]
    count = 0
    hosts.each do |host|
      (5..15).each do |val|
        Url.create! do |u|
          u.priority = ( val.to_f / 10.0 ) + (count.to_f / 50.0)
          vurl = VisitURL.normalize( "http://#{host}/#{u.priority}" )
          u.type = "FEED"
          u.host = vurl.host
          u.url = vurl.to_s
          u.uhash = vurl.uhash
          u.next_visit_after = Time.now
          count += 1
        end
      end
    end
  end

  def teardown
    Url.delete_all
  end

  # Query to get new work, with limits on work per host, and total
  # work (in descending priority order)
  def test_poll
    query = <<-END
      SELECT url, host, type, priority
      FROM ( SELECT *, row_number() OVER ( ORDER BY priority DESC ) as ppos
             FROM ( SELECT *, row_number() OVER ( PARTITION BY host
                                                  ORDER BY priority DESC ) AS hpos
                    FROM urls
                    WHERE next_visit_after <= now() ) AS subh
             WHERE hpos <= ? ) AS subp
      WHERE ppos <= ?
      ORDER BY host, priority DESC;
    END
    res = Url.find_by_sql( [ query, HOST_LIMIT, TOTAL_LIMIT ] )

    assert( res.length <= TOTAL_LIMIT )

    # Check every row of every host. Grouping by host keeps each row in
    # its host's subset, including the first row seen for a new host.
    res.group_by { |u| u.host }.each_value do |byhost|
      check_host_subset( byhost )
    end
  end

  # Stage candidate URLs in a temporary table, then copy into urls only
  # those whose uhash is not already present. No assertions are made;
  # the test passes if all statements execute without error.
  def test_insert
    Url.transaction do
      create_sql = <<-END
        CREATE TEMPORARY TABLE mod_urls
          ( uhash text,
            url text,
            host text );
      END
      # Possible alternative to the explicit DROP below: ON COMMIT DROP;

      Url.connection.execute( create_sql ) #FIXME: auto-commit mode?

      # Start at the row count setup has reached when it begins its
      # third host (gravitext.com), using the same priority formula.
      # NOTE(review): looks intended to make values 5..15 collide with
      # existing rows, leaving 16..20 as new ones; confirm.
      count = ( 11 * 2 )
      (5..20).each do |val|
        priority = ( val.to_f / 10.0 ) + (count.to_f / 50.0)
        vurl = VisitURL.normalize( "http://gravitext.com/#{priority}" )

        sql = "INSERT into mod_urls VALUES ('%s','%s','%s')" %
          [ vurl.uhash, vurl.to_s, vurl.host ]
        Url.connection.execute( sql )
        count += 1
      end

      insert_query = <<-END
        INSERT INTO urls (uhash,url,host,type,priority)
        ( SELECT uhash,url,host,'FEEDX',4.78 FROM mod_urls
          WHERE uhash NOT IN ( SELECT uhash FROM urls ) );
      END
      Url.connection.execute( insert_query )

      Url.connection.execute( "DROP TABLE mod_urls;" )
    end
  end

  private

  # Assert that one host's rows respect the per-host limit and are in
  # non-increasing priority order.
  def check_host_subset( byhost )
    assert( byhost.length <= HOST_LIMIT )
    byhost.each_cons(2) { |p,n| assert( p.priority >= n.priority ) }
  end

end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2008-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
20
|
+
|
21
|
+
require 'iudex-core'
|
22
|
+
require 'iudex-da/ar'
|
23
|
+
|
24
|
+
require 'iudex-da'
|
25
|
+
require 'iudex-da/pool_data_source_factory'
|
26
|
+
|
27
|
+
# Creates a data source from PoolDataSourceFactory and runs a single
# query through it with commons-dbutils.
class TestPoolFactory < MiniTest::Unit::TestCase
  include Iudex::DA
  import 'org.apache.commons.dbutils.ResultSetHandler'
  import 'org.apache.commons.dbutils.QueryRunner'

  # Build a fresh factory and data source for each test.
  def setup
    @factory = PoolDataSourceFactory.new( :loglevel => 2 )
    @data_source = @factory.create
  end

  # Close the factory and drop the reference to its data source.
  def teardown
    @factory.close
    @data_source = nil
  end

  # ResultSetHandler that prints the url column of every row and
  # returns nil.
  class TestHandler
    include ResultSetHandler

    def handle( rs )
      p( [ rs.string( 'url' ) ] ) while rs.next
      nil
    end
  end

  # The query only needs to execute; no assertion is made about the
  # rows it returns.
  def test_query
    assert( ! @data_source.nil? )

    sql = "SELECT url FROM urls WHERE uhash IN ('uRlU1h_YL-NvooSv2i98Rd3', 'notthere' );"
    runner = QueryRunner.new( @data_source )
    runner.query( sql, TestHandler.new )
  end

end
|
metadata
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-da
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-core
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: activerecord
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ~>
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 2.3.10
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: jdbc-postgres
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 8.4.702
|
47
|
+
- - <
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "9.1"
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id003
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: activerecord-jdbcpostgresql-adapter
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.1.0
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id004
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: rjack-commons-dbcp
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ~>
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: 1.4.0
|
72
|
+
type: :runtime
|
73
|
+
version_requirements: *id005
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: rjack-commons-dbutils
|
76
|
+
prerelease: false
|
77
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.3.0
|
83
|
+
type: :runtime
|
84
|
+
version_requirements: *id006
|
85
|
+
- !ruby/object:Gem::Dependency
|
86
|
+
name: minitest
|
87
|
+
prerelease: false
|
88
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.7.1
|
94
|
+
- - <
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "2.1"
|
97
|
+
type: :development
|
98
|
+
version_requirements: *id007
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: rjack-logback
|
101
|
+
prerelease: false
|
102
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
103
|
+
none: false
|
104
|
+
requirements:
|
105
|
+
- - ~>
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: "1.0"
|
108
|
+
type: :development
|
109
|
+
version_requirements: *id008
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rjack-tarpit
|
112
|
+
prerelease: false
|
113
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
114
|
+
none: false
|
115
|
+
requirements:
|
116
|
+
- - ~>
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: 1.3.0
|
119
|
+
type: :development
|
120
|
+
version_requirements: *id009
|
121
|
+
description: |-
|
122
|
+
Iudex is a general purpose web crawler and feed processor in
|
123
|
+
ruby/java. The iudex-da gem provides a PostgreSQL-based content
|
124
|
+
meta-data store and work priority queue.
|
125
|
+
email:
|
126
|
+
- dek-oss@gravitext.com
|
127
|
+
executables:
|
128
|
+
- iudex-da-generate-test-data
|
129
|
+
- iudex-da-import
|
130
|
+
- iudex-da-simhash-dump
|
131
|
+
- iudex-migrate
|
132
|
+
extensions: []
|
133
|
+
|
134
|
+
extra_rdoc_files:
|
135
|
+
- Manifest.txt
|
136
|
+
- History.rdoc
|
137
|
+
- README.rdoc
|
138
|
+
files:
|
139
|
+
- History.rdoc
|
140
|
+
- Manifest.txt
|
141
|
+
- README.rdoc
|
142
|
+
- Rakefile
|
143
|
+
- pom.xml
|
144
|
+
- bin/iudex-da-generate-test-data
|
145
|
+
- bin/iudex-da-import
|
146
|
+
- bin/iudex-da-simhash-dump
|
147
|
+
- bin/iudex-migrate
|
148
|
+
- config/config.rb
|
149
|
+
- db/0010_base_urls.rb
|
150
|
+
- db/0020_add_feed_metadata.rb
|
151
|
+
- db/0021_more_feed_text.rb
|
152
|
+
- db/0030_add_priority.rb
|
153
|
+
- db/0040_add_visit_after.rb
|
154
|
+
- db/0050_add_cache_location.rb
|
155
|
+
- db/0060_url_indexes.rb
|
156
|
+
- db/0070_add_created_at.rb
|
157
|
+
- db/0080_add_simhash.rb
|
158
|
+
- lib/iudex-da/base.rb
|
159
|
+
- lib/iudex-da.rb
|
160
|
+
- lib/iudex-da/ar.rb
|
161
|
+
- lib/iudex-da/config.rb
|
162
|
+
- lib/iudex-da/factory_helper.rb
|
163
|
+
- lib/iudex-da/importer.rb
|
164
|
+
- lib/iudex-da/key_helper.rb
|
165
|
+
- lib/iudex-da/pool_data_source_factory.rb
|
166
|
+
- test/setup.rb
|
167
|
+
- test/test_migrate.rb
|
168
|
+
- test/test_poll_work.rb
|
169
|
+
- test/test_pool_factory.rb
|
170
|
+
- lib/iudex-da/iudex-da-1.0.0.jar
|
171
|
+
has_rdoc: true
|
172
|
+
homepage: http://github.com/dekellum/iudex
|
173
|
+
licenses: []
|
174
|
+
|
175
|
+
post_install_message:
|
176
|
+
rdoc_options:
|
177
|
+
- --main
|
178
|
+
- README.rdoc
|
179
|
+
require_paths:
|
180
|
+
- lib
|
181
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
182
|
+
none: false
|
183
|
+
requirements:
|
184
|
+
- - ">="
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: "0"
|
187
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
|
+
none: false
|
189
|
+
requirements:
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: "0"
|
193
|
+
requirements: []
|
194
|
+
|
195
|
+
rubyforge_project: iudex-da
|
196
|
+
rubygems_version: 1.5.1
|
197
|
+
signing_key:
|
198
|
+
specification_version: 3
|
199
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
200
|
+
test_files:
|
201
|
+
- test/test_migrate.rb
|
202
|
+
- test/test_poll_work.rb
|
203
|
+
- test/test_pool_factory.rb
|